Commit ·
a5f2f0e
0
Parent(s):
Sync: Agentic app title change. lint check
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .coveragerc +56 -0
- .dockerignore +52 -0
- .gitattributes +9 -0
- .github/scripts/setup_test_data.py +320 -0
- .github/workflow_README.md +183 -0
- .github/workflows/archive_workflows/multi-os-test.yml +115 -0
- .github/workflows/ci.yml +269 -0
- .github/workflows/simple-test.yml +74 -0
- .github/workflows/sync_to_hf.yml +54 -0
- .github/workflows/sync_to_hf_zero_gpu.yml +59 -0
- .gitignore +59 -0
- AGENTS.md +113 -0
- Dockerfile +232 -0
- Dockerfile.pi +40 -0
- MANIFEST.in +4 -0
- README.md +346 -0
- README_PYPI.md +330 -0
- agent_routes.py +1167 -0
- app.py +0 -0
- cdk/__init__.py +0 -0
- cdk/app.py +85 -0
- cdk/cdk.json.example +7 -0
- cdk/cdk_config.py +371 -0
- cdk/cdk_functions.py +1665 -0
- cdk/cdk_stack.py +1991 -0
- cdk/check_resources.py +400 -0
- cdk/lambda_load_dynamo_logs.py +321 -0
- cdk/post_cdk_build_quickstart.py +40 -0
- cdk/requirements.txt +5 -0
- cli_redact.py +0 -0
- doc_redaction/__init__.py +27 -0
- doc_redaction/api.py +43 -0
- doc_redaction/assets/favicon.png +3 -0
- doc_redaction/cli_api.py +405 -0
- doc_redaction/cli_redact.py +26 -0
- doc_redaction/data_anonymise.py +9 -0
- doc_redaction/example_data/Bold minimalist professional cover letter.docx +3 -0
- doc_redaction/example_data/Difficult handwritten note.jpg +3 -0
- doc_redaction/example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf +3 -0
- doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv +0 -0
- doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf +3 -0
- doc_redaction/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv +2 -0
- doc_redaction/example_data/combined_case_notes.csv +19 -0
- doc_redaction/example_data/combined_case_notes.xlsx +3 -0
- doc_redaction/example_data/doubled_output_joined.pdf +3 -0
- doc_redaction/example_data/example_complaint_letter.jpg +3 -0
- doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf +3 -0
- doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv +277 -0
- doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv +77 -0
- doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_output_local_ocr.csv +1241 -0
.coveragerc
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[run]
|
| 2 |
+
source = .
|
| 3 |
+
omit =
|
| 4 |
+
*/tests/*
|
| 5 |
+
*/test/*
|
| 6 |
+
*/__pycache__/*
|
| 7 |
+
*/venv/*
|
| 8 |
+
*/env/*
|
| 9 |
+
*/build/*
|
| 10 |
+
*/dist/*
|
| 11 |
+
*/cdk/*
|
| 12 |
+
*/docs/*
|
| 13 |
+
*/example_data/*
|
| 14 |
+
*/examples/*
|
| 15 |
+
*/feedback/*
|
| 16 |
+
*/logs/*
|
| 17 |
+
*/old_code/*
|
| 18 |
+
*/output/*
|
| 19 |
+
*/tmp/*
|
| 20 |
+
*/usage/*
|
| 21 |
+
*/tld/*
|
| 22 |
+
*/tesseract/*
|
| 23 |
+
*/poppler/*
|
| 24 |
+
config*.py
|
| 25 |
+
setup.py
|
| 26 |
+
lambda_entrypoint.py
|
| 27 |
+
entrypoint.sh
|
| 28 |
+
cli_redact.py
|
| 29 |
+
load_dynamo_logs.py
|
| 30 |
+
load_s3_logs.py
|
| 31 |
+
*.spec
|
| 32 |
+
Dockerfile
|
| 33 |
+
*.qmd
|
| 34 |
+
*.md
|
| 35 |
+
*.txt
|
| 36 |
+
*.yml
|
| 37 |
+
*.yaml
|
| 38 |
+
*.json
|
| 39 |
+
*.csv
|
| 40 |
+
*.env
|
| 41 |
+
*.bat
|
| 42 |
+
*.ps1
|
| 43 |
+
*.sh
|
| 44 |
+
|
| 45 |
+
[report]
|
| 46 |
+
exclude_lines =
|
| 47 |
+
pragma: no cover
|
| 48 |
+
def __repr__
|
| 49 |
+
if self.debug:
|
| 50 |
+
if settings.DEBUG
|
| 51 |
+
raise AssertionError
|
| 52 |
+
raise NotImplementedError
|
| 53 |
+
if 0:
|
| 54 |
+
if __name__ == .__main__.:
|
| 55 |
+
class .*\bProtocol\):
|
| 56 |
+
@(abc\.)?abstractmethod
|
.dockerignore
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.url
|
| 2 |
+
*.ipynb
|
| 3 |
+
*.pyc
|
| 4 |
+
*.qmd
|
| 5 |
+
_quarto.yml
|
| 6 |
+
quarto_site/*
|
| 7 |
+
src/*
|
| 8 |
+
redaction_deps/*
|
| 9 |
+
.venv/*
|
| 10 |
+
examples/*
|
| 11 |
+
processing/*
|
| 12 |
+
tools/__pycache__/*
|
| 13 |
+
old_code/*
|
| 14 |
+
tesseract/*
|
| 15 |
+
poppler/*
|
| 16 |
+
build/*
|
| 17 |
+
dist/*
|
| 18 |
+
docs/*
|
| 19 |
+
.pi/*
|
| 20 |
+
build_deps/*
|
| 21 |
+
user_guide/*
|
| 22 |
+
_extensions/*
|
| 23 |
+
workspace/*
|
| 24 |
+
doc_redaction.egg-info/*
|
| 25 |
+
.venv_pypi_test/*
|
| 26 |
+
cdk/config/*
|
| 27 |
+
tld/*
|
| 28 |
+
cdk/config/*
|
| 29 |
+
cdk/cdk.out/*
|
| 30 |
+
cdk/archive/*
|
| 31 |
+
cdk.json
|
| 32 |
+
cdk.context.json
|
| 33 |
+
.quarto/*
|
| 34 |
+
logs/
|
| 35 |
+
output/
|
| 36 |
+
input/
|
| 37 |
+
feedback/
|
| 38 |
+
config/
|
| 39 |
+
usage/
|
| 40 |
+
test/config/*
|
| 41 |
+
test/feedback/*
|
| 42 |
+
test/input/*
|
| 43 |
+
test/logs/*
|
| 44 |
+
test/output/*
|
| 45 |
+
test/tmp/*
|
| 46 |
+
test/usage/*
|
| 47 |
+
.ruff_cache/*
|
| 48 |
+
model_cache/*
|
| 49 |
+
sanitized_file/*
|
| 50 |
+
src/doc_redaction.egg-info/*
|
| 51 |
+
docker_compose/*
|
| 52 |
+
skills/example_prompts/*
|
.gitattributes
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.sh text eol=lf
|
| 3 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.xls filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.docx filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.doc filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.ico filter=lfs diff=lfs merge=lfs -text
|
.github/scripts/setup_test_data.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Setup script for GitHub Actions test data.
|
| 4 |
+
Creates dummy test files when example data is not available.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def create_directories():
    """Make sure the example-data folders used by the tests exist.

    Creates ``doc_redaction/example_data`` and its ``example_outputs``
    sub-folder relative to the current working directory. Folders that are
    already present are left untouched. One confirmation line is printed per
    folder.
    """
    required_folders = (
        "doc_redaction/example_data",
        "doc_redaction/example_data/example_outputs",
    )

    for folder in required_folders:
        # exist_ok keeps repeated runs (e.g. CI re-runs) from failing.
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _load_reportlab(auto_install):
    """Return ``(canvas_module, letter_pagesize)``, or ``None`` if unavailable.

    When ReportLab cannot be imported and *auto_install* is true, one
    ``pip install reportlab`` attempt is made with the interpreter that is
    running this script before giving up.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        if not auto_install:
            return None

        import importlib
        import subprocess

        try:
            # Use ``sys.executable -m pip`` so the package is installed into
            # the environment that will import it, not whichever ``pip``
            # happens to be first on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "reportlab"]
            )
            # The package did not exist when the interpreter started, so the
            # import system's directory caches must be refreshed.
            importlib.invalidate_caches()
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except (ImportError, OSError, subprocess.CalledProcessError) as err:
            print(f"Could not install ReportLab: {err}")
            return None

    return canvas, letter


def _write_pdf(canvas, pagesize, path, pages):
    """Write a PDF at *path*; *pages* is a list of pages of ``(y, text)`` rows."""
    print(f"Creating PDF: {path}")
    pdf = canvas.Canvas(path, pagesize=pagesize)
    for page_index, rows in enumerate(pages):
        if page_index:
            # Close the previous page before drawing the next one; the last
            # page is closed by ``save()``.
            pdf.showPage()
        for y, text in rows:
            pdf.drawString(100, y, text)
    pdf.save()
    print(f"Created dummy PDF: {path}")


def _write_placeholder_files(documents):
    """Write a short text placeholder at every PDF path in *documents*."""
    print("ReportLab not available, skipping PDF creation")
    # Create simple text files instead
    for path, _pages, placeholder in documents:
        with open(path, "w", encoding="utf-8") as handle:
            handle.write(placeholder)
    print("Created dummy text files instead of PDFs")


def create_dummy_pdf(*, auto_install=True):
    """Create the dummy PDFs used by the test-suite.

    Three files are written below ``doc_redaction/example_data`` (relative to
    the current working directory, which must already contain that folder):
    the "emails sent to a professor" example, the Partnership Agreement
    Toolkit and the graduate cover letter.

    Args:
        auto_install: When true (the default, matching the previous
            behaviour) and ReportLab is missing, try to ``pip install`` it.
            Pass ``False`` to never touch the environment.

    If ReportLab is unavailable (not installed and either not installable or
    *auto_install* is false), plain-text placeholder files are written at the
    same paths instead, so the expected file names always exist.
    """
    data_dir = "doc_redaction/example_data"
    documents = [
        (
            f"{data_dir}/example_of_emails_sent_to_a_professor_before_applying.pdf",
            [
                [
                    (750, "This is a test document for redaction testing."),
                    (700, "Email: test@example.com"),
                    (650, "Phone: 123-456-7890"),
                    (600, "Name: John Doe"),
                    (550, "Address: 123 Test Street, Test City, TC 12345"),
                ],
                [
                    (750, "Second page content"),
                    (700, "More test data: jane.doe@example.com"),
                    (650, "Another phone: 987-654-3210"),
                ],
            ],
            "This is a dummy PDF file for testing",
        ),
        (
            f"{data_dir}/Partnership-Agreement-Toolkit_0_0.pdf",
            [
                [
                    (750, "Partnership Agreement Toolkit"),
                    (700, "This is a test partnership agreement document."),
                    (650, "Contact: partnership@example.com"),
                    (600, "Phone: (555) 123-4567"),
                    (550, "Address: 123 Partnership Street, City, State 12345"),
                ],
                [
                    (750, "Page 2 - Partnership Details"),
                    (700, "More partnership information here."),
                    (650, "Contact: info@partnership.org"),
                ],
                [
                    (750, "Page 3 - Terms and Conditions"),
                    (700, "Terms and conditions content."),
                    (650, "Legal contact: legal@partnership.org"),
                ],
            ],
            "This is a dummy Partnership Agreement PDF file for testing",
        ),
        (
            f"{data_dir}/graduate-job-example-cover-letter.pdf",
            [
                [
                    (750, "Cover Letter Example"),
                    (700, "Dear Hiring Manager,"),
                    (650, "I am writing to apply for the position."),
                    (600, "Contact: applicant@example.com"),
                    (550, "Phone: (555) 987-6543"),
                    (500, "Address: 456 Job Street, Employment City, EC 54321"),
                    (450, "Sincerely,"),
                    (400, "John Applicant"),
                ],
            ],
            "This is a dummy cover letter PDF file for testing",
        ),
    ]

    reportlab = _load_reportlab(auto_install)
    if reportlab is None:
        # Previously this fallback could never run: the imports happened
        # outside the guarded block, so a failed install crashed the script.
        _write_placeholder_files(documents)
        return

    canvas, letter = reportlab
    print(f"Directory exists: {os.path.exists(data_dir)}")
    for path, pages, _placeholder in documents:
        _write_pdf(canvas, letter, path, pages)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def create_dummy_csv():
    """Write the two dummy CSV inputs used by the tabular tests.

    Produces ``combined_case_notes.csv`` (case note, client, date) and
    ``Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv`` (text, page) inside
    ``doc_redaction/example_data``. Neither file gets an index column.
    """
    # Case notes table, described row by row.
    case_notes_path = "doc_redaction/example_data/combined_case_notes.csv"
    case_note_rows = [
        (
            "Client visited for consultation regarding housing issues",
            "John Smith",
            "2024-01-15",
        ),
        (
            "Follow-up appointment scheduled for next week",
            "Jane Doe",
            "2024-01-16",
        ),
        ("Documentation submitted for review", "Bob Johnson", "2024-01-17"),
    ]
    pd.DataFrame(case_note_rows, columns=["Case Note", "Client", "Date"]).to_csv(
        case_notes_path, index=False
    )
    print(f"Created dummy CSV: {case_notes_path}")

    # Lambeth text-per-page table.
    lambeth_path = (
        "doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
    )
    lambeth_rows = [
        ("Lambeth 2030 vision document content", 1),
        ("Our Future Our Lambeth strategic plan", 2),
        ("Community engagement and development", 3),
    ]
    pd.DataFrame(lambeth_rows, columns=["text", "page"]).to_csv(
        lambeth_path, index=False
    )
    print(f"Created dummy CSV: {lambeth_path}")
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def create_dummy_word_doc():
    """Write a small ``.docx`` file containing fake contact details.

    The document is saved as ``Bold minimalist professional cover
    letter.docx`` in ``doc_redaction/example_data``. If ``python-docx``
    cannot be imported, a notice is printed and no file is created.
    """
    body_lines = (
        "This is a test document for redaction testing.",
        "Contact Information:",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street, Test City, TC 12345",
    )
    try:
        from docx import Document

        word_file = Document()
        word_file.add_heading("Test Document for Redaction", 0)
        for line in body_lines:
            word_file.add_paragraph(line)

        word_file.save(
            "doc_redaction/example_data/Bold minimalist professional cover letter.docx"
        )
        print("Created dummy Word document")
    except ImportError:
        print("python-docx not available, skipping Word document creation")
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def create_allow_deny_lists():
    """Write the dummy allow, deny and whole-page redaction lists.

    Five single-column CSV files (no index column) are created in
    ``doc_redaction/example_data``: two allow lists, two deny lists and one
    list of page numbers. A confirmation line is printed after each group.
    """
    data_dir = "doc_redaction/example_data"

    # (confirmation message, column data, files that receive that data)
    groups = (
        (
            "Created allow lists",
            {"word": ["test", "example", "document"]},
            (
                "test_allow_list_graduate.csv",
                "test_allow_list_partnership.csv",
            ),
        ),
        (
            "Created deny lists",
            {"word": ["sensitive", "confidential", "private"]},
            (
                "partnership_toolkit_redact_custom_deny_list.csv",
                "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
            ),
        ),
        (
            "Created whole page redaction list",
            {"page": [1, 2]},
            ("partnership_toolkit_redact_some_pages.csv",),
        ),
    )

    for message, columns, file_names in groups:
        for file_name in file_names:
            pd.DataFrame(columns).to_csv(f"{data_dir}/{file_name}", index=False)
        print(message)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def create_ocr_output():
    """Write a three-row dummy OCR results CSV.

    The file ``doubled_output_joined.pdf_ocr_output.csv`` is placed in
    ``doc_redaction/example_data/example_outputs`` with the columns page,
    text, left, top, width, height and line (no index column).
    """
    column_names = ["page", "text", "left", "top", "width", "height", "line"]
    # One tuple per OCR line, in column order.
    ocr_rows = [
        (1, "This is page 1 content with some text", 0.1, 0.95, 0.05, 0.01, 1),
        (2, "This is page 2 content with different text", 0.3, 0.92, 0.02, 0.02, 2),
        (3, "This is page 3 content with more text", 0.5, 0.88, 0.02, 0.02, 3),
    ]

    ocr_table = pd.DataFrame(ocr_rows, columns=column_names)
    ocr_table.to_csv(
        "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
        index=False,
    )
    print("Created dummy OCR output CSV")
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def create_dummy_image():
    """Render a white 800x600 JPEG containing fake contact details.

    The image is saved as ``example_complaint_letter.jpg`` in
    ``doc_redaction/example_data``. Two TrueType font locations are tried in
    turn (DejaVuSans, then Arial); each failure is printed, and if both fail
    Pillow's default font is used. If Pillow cannot be imported, a notice is
    printed and no file is created.
    """
    font_candidates = (
        ("DejaVuSans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
        ("Arial", "/System/Library/Fonts/Arial.ttf"),
    )
    text_lines = (
        "Test Document for Redaction",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street",
    )

    try:
        from PIL import Image, ImageDraw, ImageFont

        picture = Image.new("RGB", (800, 600), color="white")
        pen = ImageDraw.Draw(picture)

        # Try to use a system font
        for label, font_path in font_candidates:
            try:
                font = ImageFont.truetype(font_path, 20)
                break
            except Exception as error:
                print(f"Error loading {label} font: {error}")
        else:
            # No candidate could be loaded.
            font = ImageFont.load_default()

        # Add text to image: lines start at y=50 and are spaced 50px apart.
        for row, text in enumerate(text_lines):
            pen.text((50, 50 + 50 * row), text, fill="black", font=font)

        picture.save("doc_redaction/example_data/example_complaint_letter.jpg")
        print("Created dummy image")

    except ImportError:
        print("PIL not available, skipping image creation")
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def main():
    """Generate every dummy test file, then report what exists on disk.

    Runs each ``create_*`` step in order, lists every file found under
    ``doc_redaction/example_data`` with its size, and finally prints whether
    each of the three PDFs the tests depend on is present. Missing files are
    only reported; nothing is raised.
    """
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    setup_steps = (
        create_directories,
        create_dummy_pdf,
        create_dummy_csv,
        create_dummy_word_doc,
        create_allow_deny_lists,
        create_ocr_output,
        create_dummy_image,
    )
    for step in setup_steps:
        step()

    print("\nTest data setup complete!")
    print("Created files:")
    for folder, _subfolders, names in os.walk("doc_redaction/example_data"):
        for name in names:
            listed_path = os.path.join(folder, name)
            print(f" {listed_path}")
            # Verify the file exists and has content
            if not os.path.exists(listed_path):
                print(" WARNING: File does not exist!")
                continue
            print(f" Size: {os.path.getsize(listed_path)} bytes")

    # Verify critical files exist
    data_dir = "doc_redaction/example_data"
    critical_files = [
        f"{data_dir}/Partnership-Agreement-Toolkit_0_0.pdf",
        f"{data_dir}/graduate-job-example-cover-letter.pdf",
        f"{data_dir}/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ]

    print("\nVerifying critical test files:")
    for critical_path in critical_files:
        if not os.path.exists(critical_path):
            print(f"❌ {critical_path} MISSING!")
            continue
        size_in_bytes = os.path.getsize(critical_path)
        print(f"✅ {critical_path} exists ({size_in_bytes} bytes)")


if __name__ == "__main__":
    main()
|
.github/workflow_README.md
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitHub Actions CI/CD Setup
|
| 2 |
+
|
| 3 |
+
This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
|
| 4 |
+
|
| 5 |
+
## Workflows Overview
|
| 6 |
+
|
| 7 |
+
### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
|
| 8 |
+
- **Purpose**: Basic test execution
|
| 9 |
+
- **Triggers**: Push to main/dev, Pull requests
|
| 10 |
+
- **OS**: Ubuntu Latest
|
| 11 |
+
- **Python**: 3.11
|
| 12 |
+
- **Features**:
|
| 13 |
+
- Installs system dependencies
|
| 14 |
+
- Sets up test data
|
| 15 |
+
- Runs CLI tests
|
| 16 |
+
- Runs pytest
|
| 17 |
+
|
| 18 |
+
### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
|
| 19 |
+
- **Purpose**: Full CI/CD pipeline
|
| 20 |
+
- **Features**:
|
| 21 |
+
- Linting (Ruff, Black)
|
| 22 |
+
- Unit tests (Python 3.10, 3.11, 3.12)
|
| 23 |
+
- Integration tests
|
| 24 |
+
- Security scanning (Safety, Bandit)
|
| 25 |
+
- Coverage reporting
|
| 26 |
+
- Package building (on main branch)
|
| 27 |
+
|
| 28 |
+
### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`)
|
| 29 |
+
- **Purpose**: Cross-platform testing
|
| 30 |
+
- **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
|
| 31 |
+
- **Python**: 3.11, 3.12, 3.13
|
| 32 |
+
- **Features**: Tests compatibility across different operating systems
|
| 33 |
+
|
| 34 |
+
### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
|
| 35 |
+
- **Purpose**: Original test workflow
|
| 36 |
+
- **Features**:
|
| 37 |
+
- Multiple Python versions
|
| 38 |
+
- System dependency installation
|
| 39 |
+
- Test data creation
|
| 40 |
+
- Coverage reporting
|
| 41 |
+
|
| 42 |
+
## Setup Scripts
|
| 43 |
+
|
| 44 |
+
### Test Data Setup (`.github/scripts/setup_test_data.py`)
|
| 45 |
+
Creates dummy test files when example data is not available:
|
| 46 |
+
- PDF documents
|
| 47 |
+
- CSV files
|
| 48 |
+
- Word documents
|
| 49 |
+
- Images
|
| 50 |
+
- Allow/deny lists
|
| 51 |
+
- OCR output files
|
| 52 |
+
|
| 53 |
+
## Usage
|
| 54 |
+
|
| 55 |
+
### Running Tests Locally
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
# Install dependencies
|
| 59 |
+
pip install -r requirements.txt
|
| 60 |
+
pip install pytest pytest-cov
|
| 61 |
+
|
| 62 |
+
# Setup test data
|
| 63 |
+
python .github/scripts/setup_test_data.py
|
| 64 |
+
|
| 65 |
+
# Run tests
|
| 66 |
+
cd test
|
| 67 |
+
python cli_epilog_suite.py
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
### GitHub Actions Triggers
|
| 71 |
+
|
| 72 |
+
1. **Push to main/dev**: Runs all tests
|
| 73 |
+
2. **Pull Request**: Runs tests and linting
|
| 74 |
+
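3. **Daily Schedule**: Runs tests at 2 AM UTC (the `schedule` trigger is currently commented out in `ci.yml`, so it does not run until re-enabled)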
|
| 75 |
+
4. **Manual Trigger**: Can be triggered manually from GitHub
|
| 76 |
+
|
| 77 |
+
## Configuration
|
| 78 |
+
|
| 79 |
+
### Environment Variables
|
| 80 |
+
- `PYTHON_VERSION`: Default Python version (3.11)
|
| 81 |
+
- `PYTHONPATH`: Set automatically for test discovery
|
| 82 |
+
|
| 83 |
+
### Caching
|
| 84 |
+
- Pip dependencies are cached for faster builds
|
| 85 |
+
- Cache key based on requirements.txt hash
|
| 86 |
+
|
| 87 |
+
### Artifacts
|
| 88 |
+
- Test results (JUnit XML)
|
| 89 |
+
- Coverage reports (HTML, XML)
|
| 90 |
+
- Security reports
|
| 91 |
+
- Build artifacts (on main branch)
|
| 92 |
+
|
| 93 |
+
## Test Data
|
| 94 |
+
|
| 95 |
+
The workflows automatically create test data when example files are missing:
|
| 96 |
+
|
| 97 |
+
### Required Files Created:
|
| 98 |
+
- `doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
|
| 99 |
+
- `doc_redaction/example_data/combined_case_notes.csv`
|
| 100 |
+
- `doc_redaction/example_data/Bold minimalist professional cover letter.docx`
|
| 101 |
+
- `doc_redaction/example_data/example_complaint_letter.jpg`
|
| 102 |
+
- `doc_redaction/example_data/test_allow_list_*.csv`
|
| 103 |
+
- `doc_redaction/example_data/partnership_toolkit_redact_*.csv`
|
| 104 |
+
- `doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
|
| 105 |
+
|
| 106 |
+
### Dependencies Installed:
|
| 107 |
+
- **System**: tesseract-ocr, poppler-utils, OpenGL libraries
|
| 108 |
+
- **Python**: All requirements.txt packages + pytest, reportlab, pillow
|
| 109 |
+
|
| 110 |
+
## Workflow Status
|
| 111 |
+
|
| 112 |
+
### Success Criteria:
|
| 113 |
+
- ✅ All tests pass
|
| 114 |
+
- ✅ No linting errors
|
| 115 |
+
- ✅ Security checks pass
|
| 116 |
+
- ✅ Coverage meets threshold (if configured)
|
| 117 |
+
|
| 118 |
+
### Failure Handling:
|
| 119 |
+
- Tests are designed to skip gracefully if files are missing
|
| 120 |
+
- AWS tests are expected to fail without credentials
|
| 121 |
+
- System dependency failures are handled with fallbacks
|
| 122 |
+
|
| 123 |
+
## Customization
|
| 124 |
+
|
| 125 |
+
### Adding New Tests:
|
| 126 |
+
1. Add test methods to `test/cli_epilog_suite.py` or pytest files under `test/test_*.py`
|
| 127 |
+
2. Update test data in `setup_test_data.py` if needed
|
| 128 |
+
3. Tests will automatically run in all workflows
|
| 129 |
+
|
| 130 |
+
### Modifying Workflows:
|
| 131 |
+
1. Edit the appropriate `.yml` file
|
| 132 |
+
2. Test locally first
|
| 133 |
+
3. Push to trigger the workflow
|
| 134 |
+
|
| 135 |
+
### Environment-Specific Settings:
|
| 136 |
+
- **Ubuntu**: Full system dependencies
|
| 137 |
+
- **Windows**: Python packages only
|
| 138 |
+
- **macOS**: Homebrew dependencies
|
| 139 |
+
|
| 140 |
+
## Troubleshooting
|
| 141 |
+
|
| 142 |
+
### Common Issues:
|
| 143 |
+
|
| 144 |
+
1. **Missing Dependencies**:
|
| 145 |
+
- Check system dependency installation
|
| 146 |
+
- Verify Python package versions
|
| 147 |
+
|
| 148 |
+
2. **Test Failures**:
|
| 149 |
+
- Check test data creation
|
| 150 |
+
- Verify file paths
|
| 151 |
+
- Review test output logs
|
| 152 |
+
|
| 153 |
+
3. **AWS Test Failures**:
|
| 154 |
+
- Expected without credentials
|
| 155 |
+
- Tests are designed to handle this gracefully
|
| 156 |
+
|
| 157 |
+
4. **System Dependency Issues**:
|
| 158 |
+
- Different OS have different requirements
|
| 159 |
+
- Check the specific OS section in workflows
|
| 160 |
+
|
| 161 |
+
### Debug Mode:
|
| 162 |
+
Add `--verbose` or `-v` flags to pytest commands for more detailed output.
|
| 163 |
+
|
| 164 |
+
## Security
|
| 165 |
+
|
| 166 |
+
- Dependencies are scanned with Safety
|
| 167 |
+
- Code is scanned with Bandit
|
| 168 |
+
- No secrets are exposed in logs
|
| 169 |
+
- Test data is temporary and cleaned up
|
| 170 |
+
|
| 171 |
+
## Performance
|
| 172 |
+
|
| 173 |
+
- Tests run in parallel where possible
|
| 174 |
+
- Dependencies are cached
|
| 175 |
+
- Only necessary system packages are installed
|
| 176 |
+
- Test data is created efficiently
|
| 177 |
+
|
| 178 |
+
## Monitoring
|
| 179 |
+
|
| 180 |
+
- Workflow status is visible in GitHub Actions tab
|
| 181 |
+
- Coverage reports are uploaded to Codecov
|
| 182 |
+
- Test results are available as artifacts
|
| 183 |
+
- Security reports are generated and stored
|
.github/workflows/archive_workflows/multi-os-test.yml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Multi-OS Test
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ main ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ main ]
|
| 8 |
+
|
| 9 |
+
permissions:
|
| 10 |
+
contents: read
|
| 11 |
+
actions: read
|
| 12 |
+
|
| 13 |
+
jobs:
|
| 14 |
+
test:
|
| 15 |
+
runs-on: ${{ matrix.os }}
|
| 16 |
+
env:
|
| 17 |
+
SHOW_VLM_MODEL_OPTIONS: "False"
|
| 18 |
+
strategy:
|
| 19 |
+
matrix:
|
| 20 |
+
os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
|
| 21 |
+
python-version: ["3.11", "3.12", "3.13"]
|
| 22 |
+
exclude:
  # Exclude some combinations to reduce CI time.
  # An exclude entry is matched against individual matrix combinations, so
  # each os/python-version pair to skip needs its own entry with a single
  # (scalar) version; a list value never equals a matrix value and would
  # exclude nothing.
  #- os: windows-latest
  #  python-version: "3.12"
  #- os: windows-latest
  #  python-version: "3.13"
  - os: macos-latest
    python-version: "3.12"
  - os: macos-latest
    python-version: "3.13"
|
| 28 |
+
|
| 29 |
+
steps:
|
| 30 |
+
- uses: actions/checkout@v6
|
| 31 |
+
|
| 32 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 33 |
+
uses: actions/setup-python@v6
|
| 34 |
+
with:
|
| 35 |
+
python-version: ${{ matrix.python-version }}
|
| 36 |
+
|
| 37 |
+
- name: Install system dependencies (Ubuntu)
|
| 38 |
+
if: matrix.os == 'ubuntu-latest'
|
| 39 |
+
run: |
|
| 40 |
+
sudo apt-get update
|
| 41 |
+
sudo apt-get install -y \
|
| 42 |
+
tesseract-ocr \
|
| 43 |
+
tesseract-ocr-eng \
|
| 44 |
+
poppler-utils \
|
| 45 |
+
libgl1-mesa-dri \
|
| 46 |
+
libglib2.0-0 \
|
| 47 |
+
libsm6 \
|
| 48 |
+
libxext6 \
|
| 49 |
+
libxrender-dev \
|
| 50 |
+
libgomp1
|
| 51 |
+
|
| 52 |
+
- name: Install system dependencies (macOS)
|
| 53 |
+
if: matrix.os == 'macos-latest'
|
| 54 |
+
run: |
|
| 55 |
+
brew install tesseract poppler
|
| 56 |
+
|
| 57 |
+
- name: Install system dependencies (Windows)
|
| 58 |
+
if: matrix.os == 'windows-latest'
|
| 59 |
+
run: |
|
| 60 |
+
# Create tools directory
|
| 61 |
+
if (!(Test-Path "C:\tools")) {
|
| 62 |
+
mkdir C:\tools
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
# Download and install Tesseract
|
| 66 |
+
$tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
|
| 67 |
+
$tesseractInstaller = "C:\tools\tesseract-installer.exe"
|
| 68 |
+
Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
|
| 69 |
+
|
| 70 |
+
# Install Tesseract silently
|
| 71 |
+
Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
|
| 72 |
+
|
| 73 |
+
# Download and extract Poppler
|
| 74 |
+
$popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
|
| 75 |
+
$popplerZip = "C:\tools\poppler.zip"
|
| 76 |
+
Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
|
| 77 |
+
|
| 78 |
+
# Extract Poppler
|
| 79 |
+
Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
|
| 80 |
+
|
| 81 |
+
# Add to PATH
|
| 82 |
+
echo "C:\tools\tesseract" >> $env:GITHUB_PATH
|
| 83 |
+
echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
|
| 84 |
+
|
| 85 |
+
# Set environment variables for your application
|
| 86 |
+
echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
|
| 87 |
+
echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
|
| 88 |
+
echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
|
| 89 |
+
|
| 90 |
+
# Verify installation using full paths (since PATH won't be updated in current session)
|
| 91 |
+
& "C:\tools\tesseract\tesseract.exe" --version
|
| 92 |
+
& "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
|
| 93 |
+
|
| 94 |
+
- name: Install Python dependencies
|
| 95 |
+
run: |
|
| 96 |
+
python -m pip install --upgrade pip
|
| 97 |
+
pip install -r requirements.txt
|
| 98 |
+
pip install pytest pytest-cov reportlab pillow
|
| 99 |
+
|
| 100 |
+
- name: Download spaCy model
|
| 101 |
+
run: |
|
| 102 |
+
python -m spacy download en_core_web_lg
|
| 103 |
+
|
| 104 |
+
- name: Setup test data
|
| 105 |
+
run: |
|
| 106 |
+
python .github/scripts/setup_test_data.py
|
| 107 |
+
|
| 108 |
+
- name: Run CLI tests
|
| 109 |
+
run: |
|
| 110 |
+
cd test
|
| 111 |
+
python cli_epilog_suite.py
|
| 112 |
+
|
| 113 |
+
- name: Run tests with pytest
|
| 114 |
+
run: |
|
| 115 |
+
pytest test/ -v --tb=short
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI/CD Pipeline
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ main ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ main ]
|
| 8 |
+
workflow_dispatch:
|
| 9 |
+
#schedule:
|
| 10 |
+
# Run tests daily at 2 AM UTC
|
| 11 |
+
# - cron: '0 2 * * *'
|
| 12 |
+
|
| 13 |
+
permissions:
|
| 14 |
+
contents: read
|
| 15 |
+
actions: read
|
| 16 |
+
pull-requests: write
|
| 17 |
+
issues: write
|
| 18 |
+
|
| 19 |
+
env:
|
| 20 |
+
PYTHON_VERSION: "3.11"
|
| 21 |
+
|
| 22 |
+
jobs:
|
| 23 |
+
lint:
|
| 24 |
+
runs-on: ubuntu-latest
|
| 25 |
+
steps:
|
| 26 |
+
- uses: actions/checkout@v6
|
| 27 |
+
|
| 28 |
+
- name: Set up Python
|
| 29 |
+
uses: actions/setup-python@v6
|
| 30 |
+
with:
|
| 31 |
+
python-version: ${{ env.PYTHON_VERSION }}
|
| 32 |
+
|
| 33 |
+
- name: Install dependencies
|
| 34 |
+
run: |
|
| 35 |
+
python -m pip install --upgrade pip
|
| 36 |
+
pip install ruff black
|
| 37 |
+
|
| 38 |
+
- name: Run Ruff linter
|
| 39 |
+
run: ruff check .
|
| 40 |
+
|
| 41 |
+
- name: Run Black formatter check
|
| 42 |
+
run: black --check .
|
| 43 |
+
|
| 44 |
+
test-unit:
|
| 45 |
+
runs-on: ubuntu-latest
|
| 46 |
+
env:
|
| 47 |
+
# Avoid optional VLM/torch import path in tools.run_vlm (not installed in lightweight CI deps)
|
| 48 |
+
SHOW_VLM_MODEL_OPTIONS: "False"
|
| 49 |
+
strategy:
|
| 50 |
+
matrix:
|
| 51 |
+
python-version: [3.11, 3.12, 3.13]
|
| 52 |
+
|
| 53 |
+
steps:
|
| 54 |
+
- uses: actions/checkout@v6
|
| 55 |
+
|
| 56 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 57 |
+
uses: actions/setup-python@v6
|
| 58 |
+
with:
|
| 59 |
+
python-version: ${{ matrix.python-version }}
|
| 60 |
+
|
| 61 |
+
- name: Cache pip dependencies
|
| 62 |
+
uses: actions/cache@v5
|
| 63 |
+
with:
|
| 64 |
+
path: ~/.cache/pip
|
| 65 |
+
key: ${{ runner.os }}-pip-${{ hashFiles('requirements_lightweight.txt') }}
|
| 66 |
+
restore-keys: |
|
| 67 |
+
${{ runner.os }}-pip-
|
| 68 |
+
|
| 69 |
+
- name: Install system dependencies
|
| 70 |
+
run: |
|
| 71 |
+
sudo apt-get update
|
| 72 |
+
sudo apt-get install -y \
|
| 73 |
+
tesseract-ocr \
|
| 74 |
+
tesseract-ocr-eng \
|
| 75 |
+
poppler-utils \
|
| 76 |
+
libgl1-mesa-dri \
|
| 77 |
+
libglib2.0-0 \
|
| 78 |
+
libsm6 \
|
| 79 |
+
libxext6 \
|
| 80 |
+
libxrender-dev \
|
| 81 |
+
libgomp1
|
| 82 |
+
|
| 83 |
+
- name: Install Python dependencies
|
| 84 |
+
run: |
|
| 85 |
+
python -m pip install --upgrade pip
|
| 86 |
+
pip install -r requirements_lightweight.txt
|
| 87 |
+
pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
|
| 88 |
+
|
| 89 |
+
- name: Download spaCy model
|
| 90 |
+
run: |
|
| 91 |
+
python -m spacy download en_core_web_lg
|
| 92 |
+
|
| 93 |
+
- name: Setup test data
|
| 94 |
+
run: |
|
| 95 |
+
python .github/scripts/setup_test_data.py
|
| 96 |
+
echo "Setup script completed. Checking results:"
|
| 97 |
+
ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
|
| 98 |
+
|
| 99 |
+
- name: Verify test data files
|
| 100 |
+
run: |
|
| 101 |
+
echo "Checking if critical test files exist:"
|
| 102 |
+
ls -la doc_redaction/example_data/
|
| 103 |
+
echo "Checking for specific PDF files:"
|
| 104 |
+
ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
|
| 105 |
+
echo "Checking file sizes:"
|
| 106 |
+
find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
|
| 107 |
+
|
| 108 |
+
- name: Clean up problematic config files
|
| 109 |
+
run: |
|
| 110 |
+
rm -f config*.py || true
|
| 111 |
+
|
| 112 |
+
- name: Run CLI tests
|
| 113 |
+
run: |
|
| 114 |
+
cd test
|
| 115 |
+
python cli_epilog_suite.py
|
| 116 |
+
|
| 117 |
+
- name: Run tests with pytest (JUnit and coverage)
|
| 118 |
+
run: |
|
| 119 |
+
pytest test/ -v --tb=short \
|
| 120 |
+
--junitxml=test-results.xml \
|
| 121 |
+
--cov=. --cov-config=.coveragerc \
|
| 122 |
+
--cov-report=xml --cov-report=html --cov-report=term
|
| 123 |
+
|
| 124 |
+
#- name: Upload coverage to Codecov - not necessary
|
| 125 |
+
# uses: codecov/codecov-action@v3
|
| 126 |
+
# if: matrix.python-version == '3.11'
|
| 127 |
+
# with:
|
| 128 |
+
# file: ./coverage.xml
|
| 129 |
+
# flags: unittests
|
| 130 |
+
# name: codecov-umbrella
|
| 131 |
+
# fail_ci_if_error: false
|
| 132 |
+
|
| 133 |
+
- name: Upload test results
|
| 134 |
+
uses: actions/upload-artifact@v6
|
| 135 |
+
if: always()
|
| 136 |
+
with:
|
| 137 |
+
name: test-results-python-${{ matrix.python-version }}
|
| 138 |
+
path: |
|
| 139 |
+
test-results.xml
|
| 140 |
+
htmlcov/
|
| 141 |
+
coverage.xml
|
| 142 |
+
|
| 143 |
+
test-integration:
|
| 144 |
+
runs-on: ubuntu-latest
|
| 145 |
+
needs: [lint, test-unit]
|
| 146 |
+
env:
|
| 147 |
+
SHOW_VLM_MODEL_OPTIONS: "False"
|
| 148 |
+
|
| 149 |
+
steps:
|
| 150 |
+
- uses: actions/checkout@v6
|
| 151 |
+
|
| 152 |
+
- name: Set up Python
|
| 153 |
+
uses: actions/setup-python@v6
|
| 154 |
+
with:
|
| 155 |
+
python-version: ${{ env.PYTHON_VERSION }}
|
| 156 |
+
|
| 157 |
+
- name: Install dependencies
|
| 158 |
+
run: |
|
| 159 |
+
python -m pip install --upgrade pip
|
| 160 |
+
pip install -r requirements_lightweight.txt
|
| 161 |
+
pip install pytest pytest-cov reportlab pillow
|
| 162 |
+
|
| 163 |
+
- name: Install system dependencies
|
| 164 |
+
run: |
|
| 165 |
+
sudo apt-get update
|
| 166 |
+
sudo apt-get install -y \
|
| 167 |
+
tesseract-ocr \
|
| 168 |
+
tesseract-ocr-eng \
|
| 169 |
+
poppler-utils \
|
| 170 |
+
libgl1-mesa-dri \
|
| 171 |
+
libglib2.0-0 \
|
| 172 |
+
libsm6 \
|
| 173 |
+
libxext6 \
|
| 174 |
+
libxrender-dev \
|
| 175 |
+
libgomp1
|
| 176 |
+
|
| 177 |
+
- name: Download spaCy model
|
| 178 |
+
run: |
|
| 179 |
+
python -m spacy download en_core_web_lg
|
| 180 |
+
|
| 181 |
+
- name: Setup test data
|
| 182 |
+
run: |
|
| 183 |
+
python .github/scripts/setup_test_data.py
|
| 184 |
+
echo "Setup script completed. Checking results:"
|
| 185 |
+
ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
|
| 186 |
+
|
| 187 |
+
- name: Verify test data files
|
| 188 |
+
run: |
|
| 189 |
+
echo "Checking if critical test files exist:"
|
| 190 |
+
ls -la doc_redaction/example_data/
|
| 191 |
+
echo "Checking for specific PDF files:"
|
| 192 |
+
ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
|
| 193 |
+
echo "Checking file sizes:"
|
| 194 |
+
find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
|
| 195 |
+
|
| 196 |
+
- name: Run integration tests
|
| 197 |
+
run: |
|
| 198 |
+
cd test
|
| 199 |
+
python demo_single_test.py
|
| 200 |
+
|
| 201 |
+
- name: Test CLI help
|
| 202 |
+
run: |
|
| 203 |
+
python cli_redact.py --help
|
| 204 |
+
|
| 205 |
+
- name: Test CLI version
|
| 206 |
+
run: |
|
| 207 |
+
python -c "import sys; print(f'Python {sys.version}')"
|
| 208 |
+
|
| 209 |
+
security:
|
| 210 |
+
runs-on: ubuntu-latest
|
| 211 |
+
steps:
|
| 212 |
+
- uses: actions/checkout@v6
|
| 213 |
+
|
| 214 |
+
- name: Set up Python
|
| 215 |
+
uses: actions/setup-python@v6
|
| 216 |
+
with:
|
| 217 |
+
python-version: ${{ env.PYTHON_VERSION }}
|
| 218 |
+
|
| 219 |
+
- name: Install dependencies
|
| 220 |
+
run: |
|
| 221 |
+
python -m pip install --upgrade pip
|
| 222 |
+
pip install safety bandit
|
| 223 |
+
|
| 224 |
+
#- name: Run safety scan - removed as now requires login
|
| 225 |
+
# run: |
|
| 226 |
+
# safety scan -r requirements.txt
|
| 227 |
+
|
| 228 |
+
- name: Run bandit security check
|
| 229 |
+
run: |
|
| 230 |
+
bandit -r . -f json -o bandit-report.json || true
|
| 231 |
+
|
| 232 |
+
- name: Upload security report
|
| 233 |
+
uses: actions/upload-artifact@v6
|
| 234 |
+
if: always()
|
| 235 |
+
with:
|
| 236 |
+
name: security-report
|
| 237 |
+
path: bandit-report.json
|
| 238 |
+
|
| 239 |
+
build:
|
| 240 |
+
runs-on: ubuntu-latest
|
| 241 |
+
needs: [lint, test-unit]
|
| 242 |
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
| 243 |
+
|
| 244 |
+
steps:
|
| 245 |
+
- uses: actions/checkout@v6
|
| 246 |
+
|
| 247 |
+
- name: Set up Python
|
| 248 |
+
uses: actions/setup-python@v6
|
| 249 |
+
with:
|
| 250 |
+
python-version: ${{ env.PYTHON_VERSION }}
|
| 251 |
+
|
| 252 |
+
- name: Install build dependencies
|
| 253 |
+
run: |
|
| 254 |
+
python -m pip install --upgrade pip
|
| 255 |
+
pip install build twine
|
| 256 |
+
|
| 257 |
+
- name: Build package
|
| 258 |
+
run: |
|
| 259 |
+
python -m build
|
| 260 |
+
|
| 261 |
+
- name: Check package
|
| 262 |
+
run: |
|
| 263 |
+
twine check dist/*
|
| 264 |
+
|
| 265 |
+
- name: Upload build artifacts
|
| 266 |
+
uses: actions/upload-artifact@v6
|
| 267 |
+
with:
|
| 268 |
+
name: dist
|
| 269 |
+
path: dist/
|
.github/workflows/simple-test.yml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Simple Test Run
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ dev ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ dev ]
|
| 8 |
+
workflow_dispatch:
|
| 9 |
+
|
| 10 |
+
permissions:
|
| 11 |
+
contents: read
|
| 12 |
+
actions: read
|
| 13 |
+
|
| 14 |
+
jobs:
|
| 15 |
+
test:
|
| 16 |
+
runs-on: ubuntu-latest
|
| 17 |
+
env:
|
| 18 |
+
SHOW_VLM_MODEL_OPTIONS: "False"
|
| 19 |
+
|
| 20 |
+
steps:
|
| 21 |
+
- uses: actions/checkout@v6
|
| 22 |
+
|
| 23 |
+
- name: Set up Python 3.12
|
| 24 |
+
uses: actions/setup-python@v6
|
| 25 |
+
with:
|
| 26 |
+
python-version: "3.12"
|
| 27 |
+
|
| 28 |
+
- name: Install system dependencies
|
| 29 |
+
run: |
|
| 30 |
+
sudo apt-get update
|
| 31 |
+
sudo apt-get install -y \
|
| 32 |
+
tesseract-ocr \
|
| 33 |
+
tesseract-ocr-eng \
|
| 34 |
+
poppler-utils \
|
| 35 |
+
libgl1-mesa-dri \
|
| 36 |
+
libglib2.0-0 \
|
| 37 |
+
libsm6 \
|
| 38 |
+
libxext6 \
|
| 39 |
+
libxrender-dev \
|
| 40 |
+
libgomp1
|
| 41 |
+
|
| 42 |
+
- name: Install Python dependencies
|
| 43 |
+
run: |
|
| 44 |
+
python -m pip install --upgrade pip
|
| 45 |
+
pip install -r requirements_lightweight.txt
|
| 46 |
+
pip install pytest pytest-cov reportlab pillow
|
| 47 |
+
|
| 48 |
+
- name: Download spaCy model
|
| 49 |
+
run: |
|
| 50 |
+
python -m spacy download en_core_web_lg
|
| 51 |
+
|
| 52 |
+
- name: Setup test data
|
| 53 |
+
run: |
|
| 54 |
+
python .github/scripts/setup_test_data.py
|
| 55 |
+
echo "Setup script completed. Checking results:"
|
| 56 |
+
ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
|
| 57 |
+
|
| 58 |
+
- name: Verify test data files
|
| 59 |
+
run: |
|
| 60 |
+
echo "Checking if critical test files exist:"
|
| 61 |
+
ls -la doc_redaction/example_data/
|
| 62 |
+
echo "Checking for specific PDF files:"
|
| 63 |
+
ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
|
| 64 |
+
echo "Checking file sizes:"
|
| 65 |
+
find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
|
| 66 |
+
|
| 67 |
+
- name: Run CLI tests
|
| 68 |
+
run: |
|
| 69 |
+
cd test
|
| 70 |
+
python cli_epilog_suite.py
|
| 71 |
+
|
| 72 |
+
- name: Run tests with pytest
|
| 73 |
+
run: |
|
| 74 |
+
pytest test/ -v --tb=short
|
.github/workflows/sync_to_hf.yml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [dev]
|
| 5 |
+
workflow_dispatch:
|
| 6 |
+
|
| 7 |
+
permissions:
|
| 8 |
+
contents: read
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
sync-to-hub:
|
| 12 |
+
runs-on: ubuntu-latest
|
| 13 |
+
steps:
|
| 14 |
+
- uses: actions/checkout@v6
|
| 15 |
+
with:
|
| 16 |
+
fetch-depth: 1 # Only get the latest state
|
| 17 |
+
lfs: true # Download actual LFS files so they can be pushed
|
| 18 |
+
|
| 19 |
+
- name: Install Git LFS
|
| 20 |
+
run: git lfs install
|
| 21 |
+
|
| 22 |
+
- name: Recreate repo history (single-commit force push)
|
| 23 |
+
run: |
|
| 24 |
+
# 1. Capture the message BEFORE we delete the .git folder
|
| 25 |
+
COMMIT_MSG=$(git log -1 --pretty=%B)
|
| 26 |
+
echo "Syncing commit message: $COMMIT_MSG"
|
| 27 |
+
|
| 28 |
+
# 2. DELETE the .git folder.
|
| 29 |
+
# This turns the repo into a standard folder of files.
|
| 30 |
+
rm -rf .git
|
| 31 |
+
|
| 32 |
+
# 3. Re-initialize a brand new git repo
|
| 33 |
+
git init -b main
|
| 34 |
+
git config --global user.name "$HF_USERNAME"
|
| 35 |
+
git config --global user.email "$HF_EMAIL"
|
| 36 |
+
|
| 37 |
+
# 4. Re-install LFS (needs to be done after git init)
|
| 38 |
+
git lfs install
|
| 39 |
+
|
| 40 |
+
# 5. Add the remote
|
| 41 |
+
git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
|
| 42 |
+
|
| 43 |
+
# 6. Add all files
|
| 44 |
+
# Since this is a fresh init, Git sees EVERY file as "New"
|
| 45 |
+
git add .
|
| 46 |
+
|
| 47 |
+
# 7. Commit and Force Push
|
| 48 |
+
git commit -m "Sync: $COMMIT_MSG"
|
| 49 |
+
git push --force hf main
|
| 50 |
+
env:
|
| 51 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 52 |
+
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 53 |
+
HF_EMAIL: ${{ secrets.HF_EMAIL }}
|
| 54 |
+
HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
|
.github/workflows/sync_to_hf_zero_gpu.yml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub Zero GPU
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [dev]
|
| 5 |
+
workflow_dispatch:
|
| 6 |
+
|
| 7 |
+
permissions:
|
| 8 |
+
contents: read
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
sync-to-hub-zero-gpu:
|
| 12 |
+
runs-on: ubuntu-latest
|
| 13 |
+
steps:
|
| 14 |
+
- uses: actions/checkout@v6
|
| 15 |
+
with:
|
| 16 |
+
fetch-depth: 1 # Only get the latest state
|
| 17 |
+
lfs: true # Download actual LFS files so they can be pushed
|
| 18 |
+
|
| 19 |
+
- name: Install Git LFS
|
| 20 |
+
run: git lfs install
|
| 21 |
+
|
| 22 |
+
# HF Spaces read Space config from README.md front matter. The repo README
|
| 23 |
+
# targets GitHub (e.g. docker); patch only this CI checkout before HF push.
|
| 24 |
+
- name: Apply HF Zero GPU Space README front matter
|
| 25 |
+
run: python3 tools/apply_hf_zero_gpu_readme_frontmatter.py
|
| 26 |
+
|
| 27 |
+
- name: Recreate repo history (single-commit force push)
|
| 28 |
+
run: |
|
| 29 |
+
# 1. Capture the message BEFORE we delete the .git folder
|
| 30 |
+
COMMIT_MSG=$(git log -1 --pretty=%B)
|
| 31 |
+
echo "Syncing commit message: $COMMIT_MSG"
|
| 32 |
+
|
| 33 |
+
# 2. DELETE the .git folder.
|
| 34 |
+
# This turns the repo into a standard folder of files.
|
| 35 |
+
rm -rf .git
|
| 36 |
+
|
| 37 |
+
# 3. Re-initialize a brand new git repo
|
| 38 |
+
git init -b main
|
| 39 |
+
git config --global user.name "$HF_USERNAME"
|
| 40 |
+
git config --global user.email "$HF_EMAIL"
|
| 41 |
+
|
| 42 |
+
# 4. Re-install LFS (needs to be done after git init)
|
| 43 |
+
git lfs install
|
| 44 |
+
|
| 45 |
+
# 5. Add the remote
|
| 46 |
+
git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
|
| 47 |
+
|
| 48 |
+
# 6. Add all files
|
| 49 |
+
# Since this is a fresh init, Git sees EVERY file as "New"
|
| 50 |
+
git add .
|
| 51 |
+
|
| 52 |
+
# 7. Commit and Force Push
|
| 53 |
+
git commit -m "Sync: $COMMIT_MSG"
|
| 54 |
+
git push --force hf main
|
| 55 |
+
env:
|
| 56 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 57 |
+
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 58 |
+
HF_EMAIL: ${{ secrets.HF_EMAIL }}
|
| 59 |
+
HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
|
.gitignore
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.url
|
| 2 |
+
*.ipynb
|
| 3 |
+
*.pyc
|
| 4 |
+
*.qmd
|
| 5 |
+
_quarto.yml
|
| 6 |
+
quarto_site/*
|
| 7 |
+
src/*
|
| 8 |
+
redaction_deps/*
|
| 9 |
+
.venv/*
|
| 10 |
+
examples/*
|
| 11 |
+
processing/*
|
| 12 |
+
input/*
|
| 13 |
+
output/*
|
| 14 |
+
tools/__pycache__/*
|
| 15 |
+
old_code/*
|
| 16 |
+
tesseract/*
|
| 17 |
+
poppler/*
|
| 18 |
+
build/*
|
| 19 |
+
dist/*
|
| 20 |
+
build_deps/*
|
| 21 |
+
logs/*
|
| 22 |
+
usage/*
|
| 23 |
+
feedback/*
|
| 24 |
+
config/*
|
| 25 |
+
workspace/*
|
| 26 |
+
user_guide/*
|
| 27 |
+
_extensions/*
|
| 28 |
+
doc_redaction.egg-info/*
|
| 29 |
+
.venv_pypi_test/*
|
| 30 |
+
cdk/config/*
|
| 31 |
+
cdk/cdk.out/*
|
| 32 |
+
cdk/archive/*
|
| 33 |
+
tld/*
|
| 34 |
+
tmp/*
|
| 35 |
+
docs/*
|
| 36 |
+
.pi/*
|
| 37 |
+
cdk.out/*
|
| 38 |
+
cdk.json
|
| 39 |
+
cdk.context.json
|
| 40 |
+
precheck.context.json
|
| 41 |
+
.quarto/*
|
| 42 |
+
/.quarto/
|
| 43 |
+
/_site/
|
| 44 |
+
test/config/*
|
| 45 |
+
test/feedback/*
|
| 46 |
+
test/input/*
|
| 47 |
+
test/logs/*
|
| 48 |
+
test/output/*
|
| 49 |
+
test/tmp/*
|
| 50 |
+
test/usage/*
|
| 51 |
+
.ruff_cache/*
|
| 52 |
+
model_cache/*
|
| 53 |
+
sanitized_file/*
|
| 54 |
+
src/doc_redaction.egg-info/*
|
| 55 |
+
docker_compose/*
|
| 56 |
+
**/*.quarto_ipynb
|
| 57 |
+
skills/example_prompts/*
|
| 58 |
+
.pi/sessions/
|
| 59 |
+
docker/pi/agent/sessions/
|
AGENTS.md
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AGENTS.md
|
| 2 |
+
|
| 3 |
+
Context for AI coding agents working on **doc_redaction** (PII redaction for PDFs, images, Word, and tabular files). Human-oriented docs: [README.md](README.md). User guide: [doc_redaction user guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
|
| 4 |
+
|
| 5 |
+
## Project overview
|
| 6 |
+
|
| 7 |
+
- **Stack**: Python 3.10+, Gradio UI ([app.py](app.py)), optional FastAPI when `RUN_FASTAPI` is enabled, AWS/LLM integrations via [tools/config.py](tools/config.py) and env files under `config/`.
|
| 8 |
+
- **License**: AGPL-3.0-only (see [pyproject.toml](pyproject.toml)). Respect license terms when adding dependencies.
|
| 9 |
+
- **Accuracy**: Outputs are not guaranteed complete; downstream use should assume **human review** of redacted material.
|
| 10 |
+
|
| 11 |
+
## Cursor skills: redaction workflow (optional)
|
| 12 |
+
|
| 13 |
+
For agents operating the deployed app (Gradio Client, review CSV, `/review_apply`), these repo-local playbooks are a suggested ladder:
|
| 14 |
+
|
| 15 |
+
0. **[`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md)** — copy-paste user task prompt (Pass 1 default, Pass 2 gated); **user redaction requirements go at the end of the prompt**.
|
| 16 |
+
1. **[`skills/doc-redaction-app/SKILL.md`](skills/doc-redaction-app/SKILL.md)** — first-pass redaction (`/doc_redact` / `/redact_document`) and downloading artifacts.
|
| 17 |
+
2. **[`skills/doc-redact-page-review/SKILL.md`](skills/doc-redact-page-review/SKILL.md)** — after outputs exist: **parallel per-page** child agents, merge into one full-document `*_review_file.csv`, **single** `/review_apply` from the parent.
|
| 18 |
+
3. **[`skills/doc-redaction-modifications/SKILL.md`](skills/doc-redaction-modifications/SKILL.md)** — CSV mechanics, `preview_redaction_boxes`, `/review_apply` patterns, verification, VLM and PyMuPDF fallbacks (single-thread edits and the **technical** reference for page-review children).
|
| 19 |
+
|
| 20 |
+
## Setup
|
| 21 |
+
|
| 22 |
+
1. **System**: Install **Tesseract** and **Poppler** (required for OCR/PDF). See [README.md](README.md) (Windows/Linux sections).
|
| 23 |
+
2. **Python**: Create a venv, then install the project (e.g. `pip install -e ".[dev]"` or follow README).
|
| 24 |
+
3. **Configuration**: Copy or edit environment/config as described in README / `config/` (e.g. `app_config.env`). Do not commit secrets.
|
| 25 |
+
|
| 26 |
+
## Run locally
|
| 27 |
+
|
| 28 |
+
- The Gradio/FastAPI entrypoint is [app.py](app.py). With FastAPI enabled, the typical pattern is `uvicorn app:app --host 0.0.0.0 --port 7860` (take the exact host/port from your config).
|
| 29 |
+
- OpenAPI docs: `/docs` when the FastAPI app is mounted.
|
| 30 |
+
|
| 31 |
+
## Tests
|
| 32 |
+
|
| 33 |
+
- Run from repo root: `pytest` (optional: `pytest test/`).
|
| 34 |
+
- Fix failures related to your changes before opening a PR.
|
| 35 |
+
|
| 36 |
+
## Line order (local OCR and simple text extraction)
|
| 37 |
+
|
| 38 |
+
Multi-column layouts use shared logic in [`tools/ocr_reading_order.py`](tools/ocr_reading_order.py). Reading order is controlled by **`LOCAL_OCR_READING_ORDER`** (`column` is the default; `legacy` restores the previous top-left behaviour).
|
| 39 |
+
|
| 40 |
+
### Local OCR (Paddle/Tesseract)
|
| 41 |
+
|
| 42 |
+
Word boxes are merged into line-level CSV rows in [`combine_ocr_results`](tools/custom_image_analyser_engine.py).
|
| 43 |
+
|
| 44 |
+
- **`column`**: detect text columns, assign line numbers down each column left-to-right; full-width lines (headers) first. Stops cross-column merging that produced wide erroneous lines on multi-column PDFs. **Auto-fallback**: the page is treated as single-column unless a *consecutive cluster* of gutter rows (y-gap between adjacent rows ≤ `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, default `0.06` of page height) has ≥ `OCR_COLUMN_MIN_GUTTER_ROWS` (default `3`) rows **and** the cluster's topmost row is above the footer zone (`OCR_COLUMN_FOOTER_ZONE_FRACTION`, default `0.75`). This prevents isolated header bands (logo | title, 1 gutter row), signature-only blocks at the page bottom (cluster starts at y ≥ 0.75), or the combination of both, from forcing column mode on the single-column body text between them.
|
| 45 |
+
- **`PADDLE_PRESERVE_LINE_BOXES=True`** or **`CONVERT_LINE_TO_WORD_LEVEL=False`** with Paddle: keep Paddle line boxes (skip word split + regrouping); line numbers still use column reading order.
|
| 46 |
+
|
| 47 |
+
### Simple text extraction (PyMuPDF)
|
| 48 |
+
|
| 49 |
+
[`redact_text_pdf`](tools/file_redaction.py) → [`process_page_to_structured_ocr_pymupdf`](tools/file_redaction.py) calls [`reorder_structured_text_lines`](tools/ocr_reading_order.py) after collecting lines, using **`page.mediabox`** width/height for full-span header detection.
|
| 50 |
+
|
| 51 |
+
`reorder_structured_text_lines` now mirrors `build_line_groups` (local OCR route):
|
| 52 |
+
|
| 53 |
+
1. **Column-aware sort** (`sort_reading_order` / `assign_layout_boxes` / `detect_column_split_xpoints`) — or legacy top-left for single-column pages.
|
| 54 |
+
2. **Y-band grouping** (`group_into_lines`) — merges any same-row PyMuPDF lines that were emitted as separate objects (e.g. mixed-font spans) and splits horizontally-disparate boxes via `_finalize_line`. *Column mode only.*
|
| 55 |
+
3. **Secondary sub-column pass** (`_reorder_lines_column_major`) — ensures correct column-major order when sub-columns sit within a single macro-column. *Column mode only.*
|
| 56 |
+
4. When a group contains more than one box, constituent boxes are **merged** into a single `OCRResult` (union bbox, joined text, concatenated chars/words).
|
| 57 |
+
|
| 58 |
+
In single-column / legacy mode only step 1 is applied (steps 2–4 are skipped); PyMuPDF lines are pre-formed so no merging is needed.
|
| 59 |
+
|
| 60 |
+
### Tunables (both routes)
|
| 61 |
+
|
| 62 |
+
`OCR_FULL_SPAN_WIDTH_RATIO`, `OCR_COLUMN_GAP_MIN_FRACTION`, `OCR_COLUMN_GUTTER_MIN_FRACTION`, `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` (default `0.015` — fine-grained gutter scan in `assign_layout_boxes`; lower = detects narrower sub-column boundaries), `OCR_COLUMN_MIN_GUTTER_ROWS`, `OCR_COLUMN_MAX_BOX_HEIGHT_RATIO`, `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, `OCR_COLUMN_FOOTER_ZONE_FRACTION`, `OCR_LINE_SPLIT_GAP_FRACTION` (default 0.025 — horizontal gap fraction that forces a line split; must be below the narrowest column gutter, ~0.030 for two-page spreads; also used as the gap threshold for the secondary sub-column sort in `build_line_groups`), `OCR_LINE_Y_THRESHOLD_FRACTION` (default 0.013 — row-alignment tolerance as a fraction of page height; reduced from 0.015 to correctly separate tightly-set 10 pt body text whose row spacing is ~0.014), `OCR_LINE_Y_THRESHOLD_MIN_PX`.
|
| 63 |
+
|
| 64 |
+
**Sub-column ordering** (`build_line_groups`): after the primary word-level column sort, a second pass (`_reorder_lines_column_major`) clusters the produced line groups by their leftmost x-position using `OCR_LINE_SPLIT_GAP_FRACTION` as the gap threshold. This ensures that adjacent narrow sub-columns whose word-level centre gap is below `column_gap_threshold` (e.g. two columns on a spread where each page is already one macro-column) are still output in left-to-right column-major order rather than interleaved by y-position.
|
| 65 |
+
|
| 66 |
+
**Fine-grained gutter-based column assignment** (`assign_layout_boxes`): before falling back to centre-gap clustering, `detect_column_split_xpoints` scans the page for structural gutters at the finer `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` threshold (default 0.015). Each qualifying gutter cluster produces a `(split_x, y_min)` pair — the split point is only applied to boxes whose `top ≥ y_min`, preventing a narrow sub-column gutter (visible only in the lower two-column section) from mis-splitting a full-width introductory paragraph that sits above it. This correctly separates narrow adjacent columns (e.g. 1.9 % gutter on a two-page spread) without fragmenting full-width headings or paragraphs.
|
| 67 |
+
|
| 68 |
+
Changing line order affects PII page text, duplicate-page detection, and review CSV line indices on multi-column documents; re-review after upgrading.
|
| 69 |
+
|
| 70 |
+
## Agentic / programmatic access (two surfaces)
|
| 71 |
+
|
| 72 |
+
### 1. FastAPI Agent API (recommended for LLM agents: small JSON bodies)
|
| 73 |
+
|
| 74 |
+
When `RUN_FASTAPI` is true, routes are mounted under **`/agent`** ([agent_routes.py](agent_routes.py)).
|
| 75 |
+
|
| 76 |
+
- **Catalog**: `GET /agent/operations` — maps each Gradio `api_name` to an HTTP path and notes whether the route is implemented via CLI or returns HTTP 501 for Gradio-only flows.
|
| 77 |
+
- **Implemented POST routes** (CLI- or [tools/simplified_api.py](tools/simplified_api.py)-backed where noted):
|
| 78 |
+
`redact_document`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_pdfs`, `combine_review_csvs`, `export_review_redaction_overlay`, `export_review_page_ocr_visualisation`, `apply_review_redactions`, **`verify_redaction_coverage`** (Pass 1 QA: `must_redact` / `must_not_redact` regex lists, optional `redacted_pdf_path`, optional `auto_prune_suspicious` + `pruned_output_path`; returns `pass_strict`, `pass_with_cleanup`, `pages_flagged_for_vlm`, `pages_needing_csv_cleanup`), **`word_level_ocr_text_search`** (headless word OCR search with optional review-box overlap flags).
|
| 79 |
+
|
| 80 |
+
**Optional post-redaction Pass 1 QA (main app / CLI):** When `POST_REDACT_PASS1_QA=True` in [`tools/config.py`](tools/config.py) (or `config/app_config.env`), initial redaction emits `*_coverage_report.json` beside the review CSV and optionally `*_review_file_pruned.csv` (sibling, when `POST_REDACT_PASS1_AUTO_PRUNE=True`). Uses deny/allow lists and/or `POST_REDACT_PASS1_MUST_REDACT_PATH` / `POST_REDACT_PASS1_MUST_NOT_REDACT_PATH`. CLI overrides: `--post-redact-pass1-qa`, `--post-redact-pass1-auto-prune`. This is pre-review-apply sanity QA only — agent Pass 1 (policy edits + `/review_apply`) remains separate.
|
| 81 |
+
Note: on Gradio ([app.py](app.py)), the Review-tab visual exports use `api_name` **`page_redaction_review_image`** and **`page_ocr_review_image`**; the **`/agent`** routes above keep the explicit `export_review_*` names for the same operations.
|
| 82 |
+
- **Gradio-only stubs** (501 + JSON hint): `load_and_prepare_documents_or_data`.
|
| 83 |
+
- **Auth**: If `AGENT_API_KEY` is set in the environment, send header `X-Agent-API-Key` with that value.
|
| 84 |
+
- **Paths**: Inputs must resolve to files under the repo root, `INPUT_FOLDER`, or `OUTPUT_FOLDER` (see router validation).
|
| 85 |
+
|
| 86 |
+
Implementation uses **`cli_redact.main(direct_mode_args=...)`** where a CLI task exists (same behaviour as [cli_redact.py](cli_redact.py)); `apply_review_redactions` calls [tools/simplified_api.py](tools/simplified_api.py) instead.
|
| 87 |
+
|
| 88 |
+
### 2. Gradio Client API (e.g. Hugging Face Spaces)
|
| 89 |
+
|
| 90 |
+
For remote Spaces or any Gradio deployment exposing the HTTP API:
|
| 91 |
+
|
| 92 |
+
- **Schema**: `GET https://<host>/gradio_api/info`
|
| 93 |
+
- **Call**: `POST https://<host>/gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order matches the named endpoint’s component list).
|
| 94 |
+
- **Poll**: `GET https://<host>/gradio_api/call/{api_name}/{event_id}`
|
| 95 |
+
- **Hugging Face**: `Authorization: Bearer $HF_TOKEN`
|
| 96 |
+
|
| 97 |
+
Named `api_name` values in this app include: `redact_document`, `load_and_prepare_documents_or_data`, `apply_review_redactions`, **`doc_redact`** (simple `gr.api`: one PDF/image + optional OCR/PII knobs; returns `(output_paths, message)`; `api_name='/doc_redact'`; parameters include `document_file`, `redact_entities`, `output_dir`, `ocr_method`, `pii_method`, `allow_list`, `deny_list`, `page_min`, `page_max`, **`handwrite_signature_checkbox`** — AWS Textract extraction options such as `Extract handwriting` / `Extract signatures`), **`review_apply`** (simple `gr.api`: PDF + `*_review_file.csv`; returns `(output_paths, message)`; `api_name='/review_apply'`), **`preview_boxes`** (simple `gr.api`: PDF + `*_review_file.csv`; renders proposed boxes onto the original PDF and returns `(zip_path, message)` — use to verify coordinates *before* calling `review_apply`, no redaction applied; `api_name='/preview_boxes'`), **`pdf_summarise`** (simple `gr.api`: PDF + optional summarisation/OCR knobs; returns `(output_paths, status_message, summary_text)`; `api_name='/pdf_summarise'`), **`tabular_redact`** (simple `gr.api`: one tabular file (CSV/XLSX/Parquet/DOCX) + optional knobs; returns `(output_paths, message)`; `api_name='/tabular_redact'`), **`page_redaction_review_image`** (short review overlay export; `api_name='/page_redaction_review_image'`), **`page_ocr_review_image`** (short OCR visualisation export; `api_name='/page_ocr_review_image'`), `word_level_ocr_text_search`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_csvs`, `combine_review_pdfs`. The matching **`POST /agent`** names for those two visual exports are `export_review_redaction_overlay` and `export_review_page_ocr_visualisation` (§1). 
Many endpoints require **many positional arguments** (full Gradio state); prefer the short `gr.api` routes above or **`POST /agent/apply_review_redactions`** where applicable instead of building the full `data` array from `/gradio_api/info`.
|
| 98 |
+
|
| 99 |
+
## CLI parity
|
| 100 |
+
|
| 101 |
+
For scripting and tests, `python cli_redact.py` with flags is authoritative; programmatic merges use `get_cli_default_args_dict()` in [cli_redact.py](cli_redact.py).
|
| 102 |
+
|
| 103 |
+
## Security and data handling
|
| 104 |
+
|
| 105 |
+
- Do not commit API keys, tokens, or customer data.
|
| 106 |
+
- Treat any path outside the validated roots as untrusted (see [tools/secure_path_utils.py](tools/secure_path_utils.py)).
|
| 107 |
+
- Optional `instruction` / LLM fields must not be passed into shell commands or unconstrained config keys.
|
| 108 |
+
|
| 109 |
+
## Conventions for PRs
|
| 110 |
+
|
| 111 |
+
- Keep changes focused; avoid drive-by refactors.
|
| 112 |
+
- Match existing naming and patterns in [app.py](app.py) and [tools/](tools/).
|
| 113 |
+
- Update tests when behaviour changes; run `pytest` before merge.
|
Dockerfile
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Stage 1: Build dependencies and download models
|
| 2 |
+
FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
|
| 3 |
+
|
| 4 |
+
# Install system dependencies
|
| 5 |
+
RUN apt-get update \
|
| 6 |
+
&& apt-get upgrade -y \
|
| 7 |
+
&& apt-get install -y --no-install-recommends \
|
| 8 |
+
g++ \
|
| 9 |
+
make \
|
| 10 |
+
cmake \
|
| 11 |
+
unzip \
|
| 12 |
+
libcurl4-openssl-dev \
|
| 13 |
+
git \
|
| 14 |
+
&& pip install --upgrade pip \
|
| 15 |
+
&& apt-get clean \
|
| 16 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
+
|
| 18 |
+
WORKDIR /src
|
| 19 |
+
|
| 20 |
+
COPY requirements_lightweight.txt .
|
| 21 |
+
|
| 22 |
+
RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
|
| 23 |
+
|
| 24 |
+
ARG INSTALL_GRADIO_MCP=False
|
| 25 |
+
ENV INSTALL_GRADIO_MCP=${INSTALL_GRADIO_MCP}
|
| 26 |
+
|
| 27 |
+
RUN if [ "$INSTALL_GRADIO_MCP" = "True" ]; then \
|
| 28 |
+
pip install --verbose --no-cache-dir --force-reinstall --target=/install "gradio[mcp]<=6.10.0"; \
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
|
| 32 |
+
|
| 33 |
+
ARG INSTALL_PADDLEOCR=False
|
| 34 |
+
ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
|
| 35 |
+
|
| 36 |
+
ARG PADDLE_GPU_ENABLED=False
|
| 37 |
+
ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
|
| 38 |
+
|
| 39 |
+
RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
|
| 40 |
+
pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
|
| 41 |
+
pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
|
| 42 |
+
pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
|
| 43 |
+
elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
|
| 44 |
+
pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
|
| 45 |
+
pip install --verbose --no-cache-dir --target=/install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ && \
|
| 46 |
+
pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
|
| 47 |
+
fi
|
| 48 |
+
|
| 49 |
+
ARG INSTALL_VLM=False
|
| 50 |
+
ENV INSTALL_VLM=${INSTALL_VLM}
|
| 51 |
+
|
| 52 |
+
ARG TORCH_GPU_ENABLED=False
|
| 53 |
+
ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
|
| 54 |
+
|
| 55 |
+
# Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
|
| 56 |
+
RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
|
| 57 |
+
pip install --verbose --no-cache-dir --target=/install \
|
| 58 |
+
"torch==2.9.1+cpu" \
|
| 59 |
+
"torchvision==0.24.1+cpu" \
|
| 60 |
+
"transformers<=5.5.4" \
|
| 61 |
+
"accelerate<=1.13.0" \
|
| 62 |
+
"bitsandbytes<=0.49.2" \
|
| 63 |
+
"sentencepiece<=0.2.1" \
|
| 64 |
+
--extra-index-url https://download.pytorch.org/whl/cpu; \
|
| 65 |
+
elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
|
| 66 |
+
pip install --verbose --no-cache-dir --target=/install "torch<=2.8.0" --index-url https://download.pytorch.org/whl/cu129 && \
|
| 67 |
+
pip install --verbose --no-cache-dir --target=/install "torchvision<=0.23.0" --index-url https://download.pytorch.org/whl/cu129 && \
|
| 68 |
+
pip install --verbose --no-cache-dir --target=/install \
|
| 69 |
+
"transformers<=5.5.4" \
|
| 70 |
+
"accelerate<=1.13.0" \
|
| 71 |
+
"bitsandbytes<=0.49.2" \
|
| 72 |
+
"sentencepiece<=0.2.1" && \
|
| 73 |
+
pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
|
| 74 |
+
pip install --verbose --no-cache-dir --target=/install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
|
| 75 |
+
pip install --verbose --no-cache-dir --target=/install https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
|
| 76 |
+
fi
|
| 77 |
+
|
| 78 |
+
# ===================================================================
|
| 79 |
+
# Stage 2: A common base for both Lambda and Gradio
|
| 80 |
+
# ===================================================================
|
| 81 |
+
FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
|
| 82 |
+
|
| 83 |
+
# MUST re-declare ARGs in every stage where they are used in RUN commands
|
| 84 |
+
ARG TORCH_GPU_ENABLED=False
|
| 85 |
+
ARG PADDLE_GPU_ENABLED=False
|
| 86 |
+
|
| 87 |
+
ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
|
| 88 |
+
ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
|
| 89 |
+
|
| 90 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 91 |
+
tesseract-ocr \
|
| 92 |
+
poppler-utils \
|
| 93 |
+
libgl1 \
|
| 94 |
+
libglib2.0-0 && \
|
| 95 |
+
if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
|
| 96 |
+
apt-get install -y --no-install-recommends libgomp1; \
|
| 97 |
+
fi && \
|
| 98 |
+
apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 99 |
+
|
| 100 |
+
ENV APP_HOME=/home/user
|
| 101 |
+
|
| 102 |
+
# Set env variables for Gradio & other apps
|
| 103 |
+
ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
|
| 104 |
+
MPLCONFIGDIR=/tmp/matplotlib_cache/ \
|
| 105 |
+
GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
|
| 106 |
+
GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
|
| 107 |
+
FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
|
| 108 |
+
ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
|
| 109 |
+
USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
|
| 110 |
+
CONFIG_FOLDER=$APP_HOME/app/config/ \
|
| 111 |
+
XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
|
| 112 |
+
TESSERACT_DATA_FOLDER=/usr/share/tessdata \
|
| 113 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 114 |
+
GRADIO_SERVER_PORT=7860 \
|
| 115 |
+
PATH=$APP_HOME/.local/bin:$PATH \
|
| 116 |
+
PYTHONPATH=$APP_HOME/app \
|
| 117 |
+
PYTHONUNBUFFERED=1 \
|
| 118 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 119 |
+
GRADIO_ALLOW_FLAGGING=never \
|
| 120 |
+
GRADIO_NUM_PORTS=1 \
|
| 121 |
+
GRADIO_ANALYTICS_ENABLED=False
|
| 122 |
+
|
| 123 |
+
# Copy Python packages from the builder stage
|
| 124 |
+
COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
|
| 125 |
+
COPY --from=builder /install/bin /usr/local/bin/
|
| 126 |
+
|
| 127 |
+
# Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
|
| 128 |
+
# passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
|
| 129 |
+
RUN pip install --no-cache-dir "protobuf<=7.34.0"
|
| 130 |
+
|
| 131 |
+
# English pipeline is not a normal PyPI dependency; bundle it in the image so runtime works offline.
|
| 132 |
+
# Placed before COPY app code so application changes do not invalidate this layer.
|
| 133 |
+
RUN python -m spacy download en_core_web_lg
|
| 134 |
+
|
| 135 |
+
# Copy your application code and entrypoint
|
| 136 |
+
COPY . ${APP_HOME}/app
|
| 137 |
+
COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
|
| 138 |
+
# Fix line endings and set execute permissions
|
| 139 |
+
RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
|
| 140 |
+
&& chmod +x ${APP_HOME}/app/entrypoint.sh
|
| 141 |
+
|
| 142 |
+
WORKDIR ${APP_HOME}/app
|
| 143 |
+
|
| 144 |
+
# ===================================================================
|
| 145 |
+
# FINAL Stage 3: The Lambda Image (runs as root for simplicity)
|
| 146 |
+
# ===================================================================
|
| 147 |
+
FROM base AS lambda
|
| 148 |
+
# Set runtime ENV for Lambda mode
|
| 149 |
+
ENV APP_MODE=lambda
|
| 150 |
+
ENTRYPOINT ["/home/user/app/entrypoint.sh"]
|
| 151 |
+
CMD ["lambda_entrypoint.lambda_handler"]
|
| 152 |
+
|
| 153 |
+
# ===================================================================
|
| 154 |
+
# FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
|
| 155 |
+
# ===================================================================
|
| 156 |
+
FROM base AS gradio
|
| 157 |
+
# Set runtime ENV for Gradio mode
|
| 158 |
+
ENV APP_MODE=gradio
|
| 159 |
+
|
| 160 |
+
# Create non-root user
|
| 161 |
+
RUN useradd -m -u 1000 user
|
| 162 |
+
|
| 163 |
+
# Create the base application directory and set its ownership
|
| 164 |
+
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
|
| 165 |
+
|
| 166 |
+
# Create required sub-folders within the app directory and set their permissions
|
| 167 |
+
# This ensures these specific directories are owned by 'user'
|
| 168 |
+
RUN mkdir -p \
|
| 169 |
+
${APP_HOME}/app/output \
|
| 170 |
+
${APP_HOME}/app/input \
|
| 171 |
+
${APP_HOME}/app/logs \
|
| 172 |
+
${APP_HOME}/app/usage \
|
| 173 |
+
${APP_HOME}/app/feedback \
|
| 174 |
+
${APP_HOME}/app/config \
|
| 175 |
+
&& chown user:user \
|
| 176 |
+
${APP_HOME}/app/output \
|
| 177 |
+
${APP_HOME}/app/input \
|
| 178 |
+
${APP_HOME}/app/logs \
|
| 179 |
+
${APP_HOME}/app/usage \
|
| 180 |
+
${APP_HOME}/app/feedback \
|
| 181 |
+
${APP_HOME}/app/config \
|
| 182 |
+
&& chmod 755 \
|
| 183 |
+
${APP_HOME}/app/output \
|
| 184 |
+
${APP_HOME}/app/input \
|
| 185 |
+
${APP_HOME}/app/logs \
|
| 186 |
+
${APP_HOME}/app/usage \
|
| 187 |
+
${APP_HOME}/app/feedback \
|
| 188 |
+
${APP_HOME}/app/config
|
| 189 |
+
|
| 190 |
+
# Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
|
| 191 |
+
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
| 192 |
+
&& chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
|
| 193 |
+
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
| 194 |
+
&& chmod 700 ${XDG_CACHE_HOME} \
|
| 195 |
+
&& mkdir -p ${APP_HOME}/.paddlex \
|
| 196 |
+
&& chown user:user ${APP_HOME}/.paddlex \
|
| 197 |
+
&& chmod 755 ${APP_HOME}/.paddlex \
|
| 198 |
+
&& mkdir -p ${APP_HOME}/.local/share/spacy/data \
|
| 199 |
+
&& chown user:user ${APP_HOME}/.local/share/spacy/data \
|
| 200 |
+
&& chmod 755 ${APP_HOME}/.local/share/spacy/data \
|
| 201 |
+
&& mkdir -p /usr/share/tessdata \
|
| 202 |
+
&& chown user:user /usr/share/tessdata \
|
| 203 |
+
&& chmod 755 /usr/share/tessdata
|
| 204 |
+
|
| 205 |
+
# Apply user ownership to all files in the home directory
|
| 206 |
+
RUN chown -R user:user /home/user
|
| 207 |
+
|
| 208 |
+
# Set permissions for Python executable
|
| 209 |
+
RUN chmod 755 /usr/local/bin/python
|
| 210 |
+
|
| 211 |
+
# Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
|
| 212 |
+
VOLUME ["/tmp/matplotlib_cache"]
|
| 213 |
+
VOLUME ["/tmp/gradio_tmp"]
|
| 214 |
+
VOLUME ["/tmp/tld"]
|
| 215 |
+
VOLUME ["/home/user/app/output"]
|
| 216 |
+
VOLUME ["/home/user/app/input"]
|
| 217 |
+
VOLUME ["/home/user/app/logs"]
|
| 218 |
+
VOLUME ["/home/user/app/usage"]
|
| 219 |
+
VOLUME ["/home/user/app/feedback"]
|
| 220 |
+
VOLUME ["/home/user/app/config"]
|
| 221 |
+
VOLUME ["/home/user/.paddlex"]
|
| 222 |
+
VOLUME ["/home/user/.local/share/spacy/data"]
|
| 223 |
+
VOLUME ["/usr/share/tessdata"]
|
| 224 |
+
VOLUME ["/tmp"]
|
| 225 |
+
VOLUME ["/var/tmp"]
|
| 226 |
+
|
| 227 |
+
USER user
|
| 228 |
+
|
| 229 |
+
EXPOSE $GRADIO_SERVER_PORT
|
| 230 |
+
|
| 231 |
+
ENTRYPOINT ["/home/user/app/entrypoint.sh"]
|
| 232 |
+
CMD ["python", "app.py"]
|
Dockerfile.pi
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
|
| 2 |
+
|
| 3 |
+
FROM node:22-bookworm-slim
|
| 4 |
+
|
| 5 |
+
ENV NODE_ENV=production
|
| 6 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
+
ENV NPM_CONFIG_LOGLEVEL=warn
|
| 8 |
+
ENV PYTHONUNBUFFERED=1
|
| 9 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 10 |
+
ENV PYTHONPATH=/workspace/doc_redaction
|
| 11 |
+
|
| 12 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 13 |
+
bash \
|
| 14 |
+
git \
|
| 15 |
+
curl \
|
| 16 |
+
ca-certificates \
|
| 17 |
+
procps \
|
| 18 |
+
python3 \
|
| 19 |
+
python3-pip \
|
| 20 |
+
python3-venv \
|
| 21 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
|
| 24 |
+
|
| 25 |
+
COPY requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
|
| 26 |
+
RUN pip3 install --no-cache-dir --break-system-packages \
|
| 27 |
+
-r /tmp/requirements_pi_agent.txt \
|
| 28 |
+
&& rm /tmp/requirements_pi_agent.txt
|
| 29 |
+
|
| 30 |
+
RUN mkdir -p /home/node/.pi/agent/sessions /workspace/doc_redaction \
|
| 31 |
+
&& chown -R node:node /home/node/.pi /workspace
|
| 32 |
+
|
| 33 |
+
WORKDIR /workspace/doc_redaction
|
| 34 |
+
|
| 35 |
+
USER node
|
| 36 |
+
|
| 37 |
+
RUN pi --version
|
| 38 |
+
|
| 39 |
+
ENTRYPOINT ["pi"]
|
| 40 |
+
CMD []
|
MANIFEST.in
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
recursive-include doc_redaction/assets *.png
|
| 2 |
+
recursive-include doc_redaction/example_data *
|
| 3 |
+
recursive-include intros *.txt
|
| 4 |
+
|
README.md
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Document redaction
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: true
|
| 9 |
+
license: agpl-3.0
|
| 10 |
+
short_description: OCR / redact PDF documents and tabular data
|
| 11 |
+
---
|
| 12 |
+
# Document redaction (doc_redaction)
|
| 13 |
+
|
| 14 |
+
<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
|
| 15 |
+
|
| 16 |
+
Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## 🚀 Quick Start - Installation and first run
|
| 21 |
+
|
| 22 |
+
Follow these instructions to get the document redaction application running on your local machine.
|
| 23 |
+
|
| 24 |
+
### 1. Package installation
|
| 25 |
+
|
| 26 |
+
#### Option 1 - Recommended: Install from source repo
|
| 27 |
+
|
| 28 |
+
Clone the repository and install in editable mode:
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
git clone https://github.com/seanpedrick-case/doc_redaction.git
|
| 32 |
+
cd doc_redaction
|
| 33 |
+
pip install -e .
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
##### Install extras (Paddle or Transformers/Torch VLM)
|
| 37 |
+
|
| 38 |
+
To install with PaddleOCR:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
pip install -e ".[paddle]"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
|
| 45 |
+
```bash
|
| 46 |
+
pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
If you want to run VLMs / LLMs with the transformers package:
|
| 50 |
+
|
| 51 |
+
```bash
|
| 52 |
+
pip install -e ".[vlm]"
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
**Note:** It is difficult to get paddlepaddle-gpu working in the same environment as torch. You may well need to reinstall the CPU version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files after installing paddlepaddle-gpu, you may need to install the latest C++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170).
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
|
| 60 |
+
pip install torchvision --index-url https://download.pytorch.org/whl/cu129
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
#### Option 2 - Install from PyPI
|
| 64 |
+
|
| 65 |
+
Create a virtual environment (recommended) and install **doc_redaction**.
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
python -m venv venv
|
| 69 |
+
# Windows:
|
| 70 |
+
.\venv\Scripts\activate
|
| 71 |
+
# macOS/Linux:
|
| 72 |
+
source venv/bin/activate
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
pip install doc_redaction
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
|
| 82 |
+
|
| 83 |
+
```bash
|
| 84 |
+
pip install "doc_redaction[paddle]"
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
For running VLMs / LLMs with the transformers package:
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
pip install "doc_redaction[vlm]"
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
|
| 94 |
+
|
| 95 |
+
**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
python -m app
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
|
| 102 |
+
|
| 103 |
+
- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
|
| 104 |
+
- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
|
| 105 |
+
- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
|
| 106 |
+
|
| 107 |
+
In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
|
| 108 |
+
|
| 109 |
+
#### Option 3 - Docker installation
|
| 110 |
+
|
| 111 |
+
The doc_redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
|
| 112 |
+
|
| 113 |
+
##### With Llama.cpp / vLLM inference server
|
| 114 |
+
|
| 115 |
+
The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
|
| 116 |
+
|
| 117 |
+
For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
|
| 118 |
+
|
| 119 |
+
You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
|
| 120 |
+
|
| 121 |
+
##### Without Llama.cpp / vLLM inference server
|
| 122 |
+
|
| 123 |
+
If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can set the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED build arguments to include the PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
|
| 124 |
+
|
| 125 |
+
### 2. Install prerequisites: Tesseract and Poppler
|
| 126 |
+
|
| 127 |
+
This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding. To run the Document Redaction app successfully, these tools need to be installed and either 1. added to PATH, or 2. be in a folder that is directly referenced in the config/app_config.env file with the variables TESSERACT_FOLDER and POPPLER_FOLDER (defined [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py) if you want to see the code). The instructions below will guide you through different ways to install these dependencies.
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
#### Automated dependency setup (recommended)
|
| 132 |
+
|
| 133 |
+
If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
|
| 134 |
+
|
| 135 |
+
You need the installer script available first, which means either:
|
| 136 |
+
|
| 137 |
+
- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
|
| 138 |
+
- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
|
| 139 |
+
|
| 140 |
+
From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
python -m doc_redaction.install_deps
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
|
| 147 |
+
|
| 148 |
+
To just check whether your machine can already see the tools:
|
| 149 |
+
|
| 150 |
+
```bash
|
| 151 |
+
python -m doc_redaction.install_deps --verify-only
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
#### **On Windows**
|
| 155 |
+
|
| 156 |
+
If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
|
| 157 |
+
|
| 158 |
+
1. **Install Tesseract OCR:**
|
| 159 |
+
* Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
|
| 160 |
+
* Run the installer.
|
| 161 |
+
* **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
2. **Install Poppler:**
|
| 165 |
+
* Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
|
| 166 |
+
* Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
|
| 167 |
+
* You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
|
| 168 |
+
* Search for "Edit the system environment variables" in the Windows Start Menu and open it.
|
| 169 |
+
* Click the "Environment Variables..." button.
|
| 170 |
+
* In the "System variables" section, find and select the `Path` variable, then click "Edit...".
|
| 171 |
+
* Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
|
| 172 |
+
* Click OK on all windows to save the changes.
|
| 173 |
+
|
| 174 |
+
To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
|
| 175 |
+
---
|
| 176 |
+
|
| 177 |
+
#### **On Linux (Debian/Ubuntu)**
|
| 178 |
+
|
| 179 |
+
Open your terminal and run the following command to install Tesseract and Poppler:
|
| 180 |
+
|
| 181 |
+
```bash
|
| 182 |
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
#### **On Linux (Fedora/CentOS/RHEL)**
|
| 186 |
+
|
| 187 |
+
Open your terminal and use the `dnf` or `yum` package manager:
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
sudo dnf install -y tesseract poppler-utils
|
| 191 |
+
```
|
| 192 |
+
---
|
| 193 |
+
|
| 194 |
+
### 3. Run the Application
|
| 195 |
+
|
| 196 |
+
With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
|
| 197 |
+
|
| 198 |
+
```bash
|
| 199 |
+
python app.py
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
|
| 203 |
+
|
| 204 |
+
Open this URL in your web browser to use the document redaction tool.
|
| 205 |
+
|
| 206 |
+
#### Command line interface
|
| 207 |
+
|
| 208 |
+
For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321).
|
| 209 |
+
|
| 210 |
+
If you installed from **PyPI**, use the installed console script:
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
cli_redact --help
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
From a **repository checkout**, you can also run:
|
| 217 |
+
|
| 218 |
+
```bash
|
| 219 |
+
python cli_redact.py --help
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
#### Python package commands
|
| 223 |
+
|
| 224 |
+
For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
### 4. ⚙️ Configuration (Optional)
|
| 230 |
+
|
| 231 |
+
You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
|
| 232 |
+
|
| 233 |
+
To get started:
|
| 234 |
+
1. Locate the `example_config.env` file in the root of the project.
|
| 235 |
+
2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
|
| 236 |
+
3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
|
| 237 |
+
4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
|
| 238 |
+
|
| 239 |
+
If you do not create this file, the application will run with default settings.
|
| 240 |
+
|
| 241 |
+
#### Configuration Breakdown
|
| 242 |
+
|
| 243 |
+
Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
#### **Local & General Settings (No AWS Required)**
|
| 248 |
+
|
| 249 |
+
These settings are useful for all users, regardless of whether you are using AWS.
|
| 250 |
+
|
| 251 |
+
* `TESSERACT_FOLDER` / `POPPLER_FOLDER`
|
| 252 |
+
* Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
|
| 253 |
+
* Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
|
| 254 |
+
* **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
|
| 255 |
+
|
| 256 |
+
* `TESSERACT_DATA_FOLDER`
|
| 257 |
+
* If Tesseract runs but you see an error like `Error opening data file ./eng.traineddata` or `Tesseract couldn't load any languages`, this is usually because it can't find the `tessdata/` language files.
|
| 258 |
+
* Set this to the folder that contains `eng.traineddata` (typically a `tessdata` directory).
|
| 259 |
+
* **Examples (Windows):** `TESSERACT_DATA_FOLDER=C:/Program Files/Tesseract-OCR/tessdata`
|
| 260 |
+
|
| 261 |
+
* `SHOW_LANGUAGE_SELECTION=True`
|
| 262 |
+
* Set to `True` to display a language selection dropdown in the UI for OCR processing.
|
| 263 |
+
|
| 264 |
+
* `DEFAULT_LOCAL_OCR_MODEL=tesseract`
|
| 265 |
+
* Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two: the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. `paddle` will only return whole-line text extraction, and so will only work for OCR, not redaction.
|
| 266 |
+
|
| 267 |
+
* `SESSION_OUTPUT_FOLDER=False`
|
| 268 |
+
* If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
|
| 269 |
+
|
| 270 |
+
* `DISPLAY_FILE_NAMES_IN_LOGS=False`
|
| 271 |
+
* For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
|
| 272 |
+
|
| 273 |
+
---
|
| 274 |
+
|
| 275 |
+
#### **AWS-Specific Settings**
|
| 276 |
+
|
| 277 |
+
These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
|
| 278 |
+
|
| 279 |
+
* `RUN_AWS_FUNCTIONS=True`
|
| 280 |
+
* **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
|
| 281 |
+
|
| 282 |
+
* **UI Options:**
|
| 283 |
+
* `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
|
| 284 |
+
* `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
|
| 285 |
+
|
| 286 |
+
* **Core AWS Configuration:**
|
| 287 |
+
* `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
|
| 288 |
+
* `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
|
| 289 |
+
|
| 290 |
+
* **AWS Logging:**
|
| 291 |
+
* `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
|
| 292 |
+
* `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
|
| 293 |
+
|
| 294 |
+
* **Advanced AWS Textract Features:**
|
| 295 |
+
* `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
|
| 296 |
+
* `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
|
| 297 |
+
* `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
|
| 298 |
+
|
| 299 |
+
* **Cost Tracking (for internal accounting):**
|
| 300 |
+
* `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
|
| 301 |
+
* `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
|
| 302 |
+
* `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
|
| 303 |
+
* `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
|
| 304 |
+
|
| 305 |
+
Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
|
| 306 |
+
|
| 307 |
+
## For agents (API quickstart)
|
| 308 |
+
|
| 309 |
+
If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
|
| 310 |
+
|
| 311 |
+
- **Discover schema**: `GET /gradio_api/info`
|
| 312 |
+
- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
|
| 313 |
+
- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
|
| 314 |
+
- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
|
| 315 |
+
- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
|
| 316 |
+
|
| 317 |
+
### Choose the correct route (prefer short `gr.api` endpoints)
|
| 318 |
+
|
| 319 |
+
Fetch `/gradio_api/info` and then prefer the simplest route that exists:
|
| 320 |
+
|
| 321 |
+
- **Apply edited review CSV to a PDF**: `/review_apply`
|
| 322 |
+
- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
|
| 323 |
+
- **Summarise a PDF**: `/pdf_summarise`
|
| 324 |
+
- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
|
| 325 |
+
|
| 326 |
+
If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
|
| 327 |
+
|
| 328 |
+
### Common gotchas
|
| 329 |
+
|
| 330 |
+
- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
|
| 331 |
+
- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
|
| 332 |
+
- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
|
| 333 |
+
|
| 334 |
+
### Optional: MCP server
|
| 335 |
+
|
| 336 |
+
If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
|
| 337 |
+
|
| 338 |
+
**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
|
| 339 |
+
|
| 340 |
+
To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
|
| 341 |
+
|
| 342 |
+
For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
|
| 343 |
+
|
| 344 |
+
Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
|
| 345 |
+
|
| 346 |
+
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
|
README_PYPI.md
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Document redaction (doc_redaction)
|
| 2 |
+
|
| 3 |
+
<a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
|
| 4 |
+
|
| 5 |
+
Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 🚀 Quick Start - Installation and first run
|
| 10 |
+
|
| 11 |
+
Follow these instructions to get the document redaction application running on your local machine.
|
| 12 |
+
|
| 13 |
+
### 1. Package installation
|
| 14 |
+
|
| 15 |
+
#### Option 1 - Recommended: Install from source repo
|
| 16 |
+
|
| 17 |
+
Clone the repository and install in editable mode:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
git clone https://github.com/seanpedrick-case/doc_redaction.git
|
| 21 |
+
cd doc_redaction
|
| 22 |
+
pip install -e .
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
##### Install extras (Paddle or Transformers/Torch VLM)
|
| 26 |
+
|
| 27 |
+
To install with PaddleOCR:
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
pip install -e ".[paddle]"
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
|
| 34 |
+
```bash
|
| 35 |
+
pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
If you want to run VLMs / LLMs with the transformers package:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
pip install -e ".[vlm]"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
**Note:** It is difficult to get paddlepaddle-gpu working in an environment alongside torch. You may well need to reinstall the CPU version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following the paddlepaddle-gpu install, you may need to install the latest C++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170).
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
|
| 49 |
+
pip install torchvision --index-url https://download.pytorch.org/whl/cu129
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
#### Option 2 - Install from PyPI
|
| 53 |
+
|
| 54 |
+
Create a virtual environment (recommended) and install **doc_redaction**.
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
python -m venv venv
|
| 58 |
+
# Windows:
|
| 59 |
+
.\venv\Scripts\activate
|
| 60 |
+
# macOS/Linux:
|
| 61 |
+
source venv/bin/activate
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install doc_redaction
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
pip install "doc_redaction[paddle]"
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
For running VLMs / LLMs with the transformers package:
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
pip install "doc_redaction[vlm]"
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
|
| 83 |
+
|
| 84 |
+
**Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
python -m app
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
**Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
|
| 91 |
+
|
| 92 |
+
- It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
|
| 93 |
+
- It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
|
| 94 |
+
- The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
|
| 95 |
+
|
| 96 |
+
In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
|
| 97 |
+
|
| 98 |
+
#### Option 3 - Docker installation
|
| 99 |
+
|
| 100 |
+
The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
|
| 101 |
+
|
| 102 |
+
##### With Llama.cpp / vLLM inference server
|
| 103 |
+
|
| 104 |
+
The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
|
| 105 |
+
|
| 106 |
+
For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
|
| 107 |
+
|
| 108 |
+
You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
|
| 109 |
+
|
| 110 |
+
##### Without Llama.cpp / vLLM inference server
|
| 111 |
+
|
| 112 |
+
If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
|
| 113 |
+
|
| 114 |
+
### 2. Install prerequisites: Tesseract and Poppler
|
| 115 |
+
|
| 116 |
+
This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
#### Automated dependency setup (recommended)
|
| 121 |
+
|
| 122 |
+
If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
|
| 123 |
+
|
| 124 |
+
You need the installer script available first, which means either:
|
| 125 |
+
|
| 126 |
+
- **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
|
| 127 |
+
- **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
|
| 128 |
+
|
| 129 |
+
From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
python -m doc_redaction.install_deps
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
|
| 136 |
+
|
| 137 |
+
To just check whether your machine can already see the tools:
|
| 138 |
+
|
| 139 |
+
```bash
|
| 140 |
+
python -m doc_redaction.install_deps --verify-only
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
#### **On Windows**
|
| 144 |
+
|
| 145 |
+
If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
|
| 146 |
+
|
| 147 |
+
1. **Install Tesseract OCR:**
|
| 148 |
+
* Download the installer from the official [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
|
| 149 |
+
* Run the installer.
|
| 150 |
+
* **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
2. **Install Poppler:**
|
| 154 |
+
* Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
|
| 155 |
+
* Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
|
| 156 |
+
* You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
|
| 157 |
+
* Search for "Edit the system environment variables" in the Windows Start Menu and open it.
|
| 158 |
+
* Click the "Environment Variables..." button.
|
| 159 |
+
* In the "System variables" section, find and select the `Path` variable, then click "Edit...".
|
| 160 |
+
* Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
|
| 161 |
+
* Click OK on all windows to save the changes.
|
| 162 |
+
|
| 163 |
+
To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
|
| 164 |
+
---
|
| 165 |
+
|
| 166 |
+
#### **On Linux (Debian/Ubuntu)**
|
| 167 |
+
|
| 168 |
+
Open your terminal and run the following command to install Tesseract and Poppler:
|
| 169 |
+
|
| 170 |
+
```bash
|
| 171 |
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
#### **On Linux (Fedora/CentOS/RHEL)**
|
| 175 |
+
|
| 176 |
+
Open your terminal and use the `dnf` or `yum` package manager:
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
sudo dnf install -y tesseract poppler-utils
|
| 180 |
+
```
|
| 181 |
+
---
|
| 182 |
+
|
| 183 |
+
### 3. Run the Application
|
| 184 |
+
|
| 185 |
+
With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
python app.py
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
|
| 192 |
+
|
| 193 |
+
Open this URL in your web browser to use the document redaction tool.
|
| 194 |
+
|
| 195 |
+
#### Command line interface
|
| 196 |
+
|
| 197 |
+
For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321)
|
| 198 |
+
|
| 199 |
+
If you installed from **PyPI**, use the installed console script:
|
| 200 |
+
|
| 201 |
+
```bash
|
| 202 |
+
cli_redact --help
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
From a **repository checkout**, you can also run:
|
| 206 |
+
|
| 207 |
+
```bash
|
| 208 |
+
python cli_redact.py --help
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
#### Python package commands
|
| 212 |
+
|
| 213 |
+
For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
|
| 214 |
+
|
| 215 |
+
---
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
### 4. ⚙️ Configuration (Optional)
|
| 219 |
+
|
| 220 |
+
You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the settings you can modify in the `app_config.env` file can be seen in `tools/config.py`, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
|
| 221 |
+
|
| 222 |
+
To get started:
|
| 223 |
+
1. Locate the `example_config.env` file in the root of the project.
|
| 224 |
+
2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
|
| 225 |
+
3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
|
| 226 |
+
4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
|
| 227 |
+
|
| 228 |
+
If you do not create this file, the application will run with default settings.
|
| 229 |
+
|
| 230 |
+
#### Configuration Breakdown
|
| 231 |
+
|
| 232 |
+
Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
|
| 233 |
+
|
| 234 |
+
---
|
| 235 |
+
|
| 236 |
+
#### **Local & General Settings (No AWS Required)**
|
| 237 |
+
|
| 238 |
+
These settings are useful for all users, regardless of whether you are using AWS.
|
| 239 |
+
|
| 240 |
+
* `TESSERACT_FOLDER` / `POPPLER_FOLDER`
|
| 241 |
+
* Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
|
| 242 |
+
* Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
|
| 243 |
+
* **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
|
| 244 |
+
|
| 245 |
+
* `SHOW_LANGUAGE_SELECTION=True`
|
| 246 |
+
* Set to `True` to display a language selection dropdown in the UI for OCR processing.
|
| 247 |
+
|
| 248 |
+
* `DEFAULT_LOCAL_OCR_MODEL=tesseract`
|
| 249 |
+
* Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two: the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. `paddle` will only return whole-line text extraction, and so will only work for OCR, not redaction.
|
| 250 |
+
|
| 251 |
+
* `SESSION_OUTPUT_FOLDER=False`
|
| 252 |
+
* If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
|
| 253 |
+
|
| 254 |
+
* `DISPLAY_FILE_NAMES_IN_LOGS=False`
|
| 255 |
+
* For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
|
| 256 |
+
|
| 257 |
+
---
|
| 258 |
+
|
| 259 |
+
#### **AWS-Specific Settings**
|
| 260 |
+
|
| 261 |
+
These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
|
| 262 |
+
|
| 263 |
+
* `RUN_AWS_FUNCTIONS=True`
|
| 264 |
+
* **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
|
| 265 |
+
|
| 266 |
+
* **UI Options:**
|
| 267 |
+
* `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
|
| 268 |
+
* `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
|
| 269 |
+
|
| 270 |
+
* **Core AWS Configuration:**
|
| 271 |
+
* `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
|
| 272 |
+
* `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
|
| 273 |
+
|
| 274 |
+
* **AWS Logging:**
|
| 275 |
+
* `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
|
| 276 |
+
* `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
|
| 277 |
+
|
| 278 |
+
* **Advanced AWS Textract Features:**
|
| 279 |
+
* `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
|
| 280 |
+
* `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
|
| 281 |
+
* `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
|
| 282 |
+
|
| 283 |
+
* **Cost Tracking (for internal accounting):**
|
| 284 |
+
* `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
|
| 285 |
+
* `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
|
| 286 |
+
* `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
|
| 287 |
+
* `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
|
| 288 |
+
|
| 289 |
+
Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
|
| 290 |
+
|
| 291 |
+
## For agents (API quickstart)
|
| 292 |
+
|
| 293 |
+
If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
|
| 294 |
+
|
| 295 |
+
- **Discover schema**: `GET /gradio_api/info`
|
| 296 |
+
- **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
|
| 297 |
+
- **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
|
| 298 |
+
- **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
|
| 299 |
+
- **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
|
| 300 |
+
|
| 301 |
+
### Choose the correct route (prefer short `gr.api` endpoints)
|
| 302 |
+
|
| 303 |
+
Fetch `/gradio_api/info` and then prefer the simplest route that exists:
|
| 304 |
+
|
| 305 |
+
- **Apply edited review CSV to a PDF**: `/review_apply`
|
| 306 |
+
- **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
|
| 307 |
+
- **Summarise a PDF**: `/pdf_summarise`
|
| 308 |
+
- **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
|
| 309 |
+
|
| 310 |
+
If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
|
| 311 |
+
|
| 312 |
+
### Common gotchas
|
| 313 |
+
|
| 314 |
+
- **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
|
| 315 |
+
- **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
|
| 316 |
+
- **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
|
| 317 |
+
|
| 318 |
+
### Optional: MCP server
|
| 319 |
+
|
| 320 |
+
If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
|
| 321 |
+
|
| 322 |
+
**Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
|
| 323 |
+
|
| 324 |
+
To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
|
| 325 |
+
|
| 326 |
+
For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
|
| 327 |
+
|
| 328 |
+
Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
|
| 329 |
+
|
| 330 |
+
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
|
agent_routes.py
ADDED
|
@@ -0,0 +1,1167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI routes for programmatic / agent callers.
|
| 3 |
+
|
| 4 |
+
HTTP paths align with Gradio ``api_name`` values in app.py. See GET /agent/operations
|
| 5 |
+
for the full map. Uses cli_redact.main(direct_mode_args=...) where a CLI task exists.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import io
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Any, Dict, List, Optional
|
| 15 |
+
|
| 16 |
+
from fastapi import APIRouter, Depends, Header, HTTPException
|
| 17 |
+
from fastapi.responses import JSONResponse
|
| 18 |
+
from pydantic import BaseModel, Field, field_validator
|
| 19 |
+
|
| 20 |
+
from tools.config import (
|
| 21 |
+
AWS_LLM_PII_OPTION,
|
| 22 |
+
AWS_PII_OPTION,
|
| 23 |
+
INFERENCE_SERVER_PII_OPTION,
|
| 24 |
+
INPUT_FOLDER,
|
| 25 |
+
LOCAL_OCR_MODEL_OPTIONS,
|
| 26 |
+
LOCAL_PII_OPTION,
|
| 27 |
+
LOCAL_TRANSFORMERS_LLM_PII_OPTION,
|
| 28 |
+
OUTPUT_FOLDER,
|
| 29 |
+
)
|
| 30 |
+
from tools.secure_path_utils import validate_path_safety
|
| 31 |
+
|
| 32 |
+
# Router that collects every endpoint defined in this module; routes are
# tagged "Agent" in the generated OpenAPI schema.
router = APIRouter(tags=["Agent"])

# Directory containing this module. Relative paths supplied by callers are
# joined onto it, and it is always one of the allowed path roots.
REPO_ROOT = Path(__file__).resolve().parent
# Maximum number of characters accepted in the ``instruction`` request field.
_MAX_INSTRUCTION_LEN = 16_000

# NOTE: Paths from request bodies are untrusted. Avoid Path.resolve() on untrusted
# input (CodeQL py/path-injection); instead normalize via os.path and enforce
# containment under trusted roots.

# Mirrors app.py api_name values (Gradio).
GRADIO_API_NAMES: tuple[str, ...] = (
    "redact_document",
    "load_and_prepare_documents_or_data",
    "apply_review_redactions",
    "review_apply",
    "pdf_summarise",
    "tabular_redact",
    "word_level_ocr_text_search",
    "redact_data",
    "find_duplicate_pages",
    "find_duplicate_tabular",
    "summarise_document",
    "combine_review_csvs",
    "combine_review_pdfs",
    "export_review_redaction_overlay",
    "export_review_page_ocr_visualisation",
    "verify_redaction_coverage",
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _allowed_path_roots() -> list[Path]:
    """Return the trusted base directories that request paths may live under.

    The list always starts with ``REPO_ROOT`` and is followed by
    ``INPUT_FOLDER`` and ``OUTPUT_FOLDER`` (in that order) when they are set
    to a truthy value.

    The roots are deliberately returned unresolved: they are trusted config
    values, but skipping ``Path.resolve()`` keeps CodeQL quiet and matches the
    "no resolve on untrusted input" approach used elsewhere in this module.
    """
    configured = [
        Path(str(folder)) for folder in (INPUT_FOLDER, OUTPUT_FOLDER) if folder
    ]
    return [REPO_ROOT, *configured]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _sanitize_untrusted_path_input(path_str: str) -> str:
|
| 74 |
+
"""Basic raw-input validation before any path normalization."""
|
| 75 |
+
if not isinstance(path_str, str):
|
| 76 |
+
raise HTTPException(status_code=400, detail="Path must be a string.")
|
| 77 |
+
cleaned = path_str.strip()
|
| 78 |
+
if not cleaned:
|
| 79 |
+
raise HTTPException(status_code=400, detail="Path must not be empty.")
|
| 80 |
+
if "\x00" in cleaned:
|
| 81 |
+
raise HTTPException(status_code=400, detail="Path contains invalid null byte.")
|
| 82 |
+
return cleaned
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _normalize_untrusted_path_to_abs(path_str: str) -> str:
    """Turn an untrusted path into a normalized absolute path string.

    The input is first checked by ``_sanitize_untrusted_path_input`` and then
    ``~`` is expanded. A path that is still relative after expansion is
    anchored at ``REPO_ROOT`` (matching prior behaviour).

    This function performs no containment check; callers must still verify
    the result against the allowed roots.

    Raises:
        HTTPException: 400, propagated from the sanitizer.
    """
    expanded = os.path.expanduser(_sanitize_untrusted_path_input(path_str))
    if not os.path.isabs(expanded):
        expanded = os.path.join(str(REPO_ROOT), expanded)
    return os.path.normpath(os.path.abspath(expanded))
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _must_be_under_allowed_roots(candidate_abs: str, original: str) -> None:
    """Reject a path that is not inside the repo, INPUT_FOLDER, or OUTPUT_FOLDER.

    Both the candidate and each root are passed through ``os.path.realpath``
    before being compared with ``os.path.commonpath``.

    Args:
        candidate_abs: Absolute path to check.
        original: The path as supplied by the caller. Currently unused by the
            check itself; kept for interface compatibility.

    Raises:
        HTTPException: 403 if no allowed root contains the candidate.
    """
    resolved = os.path.realpath(str(candidate_abs))
    for base in _allowed_path_roots():
        base_real = os.path.realpath(os.path.abspath(str(base)))
        try:
            shared = os.path.commonpath([resolved, base_real])
        except ValueError:
            # Different drive on Windows or invalid path mix
            continue
        if shared == base_real:
            return
    raise HTTPException(
        status_code=403,
        detail="Path must be under the app repo, INPUT_FOLDER, or OUTPUT_FOLDER",
    )
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _path_must_be_allowed_file(path_str: str) -> str:
    """Validate an untrusted path and require it to be an existing file.

    Args:
        path_str: Caller-supplied path (relative paths are anchored at
            ``REPO_ROOT`` by the normalizer).

    Returns:
        The real (symlink-resolved) absolute path of the file.

    Raises:
        HTTPException: 400 for malformed input, a path rejected by
            ``validate_path_safety`` for every allowed root, or a path that is
            not an existing file; 403 if the path is outside the allowed roots.
    """
    resolved = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    # Containment under trusted roots first, then the "safe path" patterns.
    _must_be_under_allowed_roots(resolved, path_str)
    for base in _allowed_path_roots():
        if validate_path_safety(resolved, base_path=str(base)):
            break
    else:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    # An OS-level failure while probing is reported the same way as "missing".
    try:
        is_regular_file = Path(resolved).is_file()
    except OSError:
        is_regular_file = False
    if not is_regular_file:
        raise HTTPException(
            status_code=400, detail=f"Not a file or missing: {resolved}"
        )
    return resolved
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _path_must_be_allowed_directory(path_str: str, *, must_exist: bool = True) -> str:
    """Validate an untrusted directory path against the allowed roots.

    Args:
        path_str: Caller-supplied directory path.
        must_exist: When true (the default) the directory must already exist.
            Callers pass ``False`` for locations that are created later, e.g.
            an output directory the CLI will create.

    Returns:
        The real (symlink-resolved) absolute path of the directory.

    Raises:
        HTTPException: 400 for malformed input, a path rejected by
            ``validate_path_safety`` for every allowed root, or (when
            ``must_exist``) a path that is not a directory; 403 if the path is
            outside the allowed roots.
    """
    resolved = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    _must_be_under_allowed_roots(resolved, path_str)
    for base in _allowed_path_roots():
        if validate_path_safety(resolved, base_path=str(base)):
            break
    else:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    if not must_exist:
        return resolved

    # An OS-level failure while probing is reported as "not a directory".
    try:
        is_directory = Path(resolved).is_dir()
    except OSError:
        is_directory = False
    if not is_directory:
        raise HTTPException(status_code=400, detail=f"Not a directory: {resolved}")
    return resolved
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _optional_agent_api_key(x_agent_api_key: Optional[str] = Header(None)) -> None:
    """FastAPI dependency enforcing the optional shared-secret API key.

    When the ``AGENT_API_KEY`` environment variable is unset or blank, the
    check is disabled and every request passes. Otherwise the
    ``X-Agent-API-Key`` request header (whitespace-stripped) must equal the
    configured key.

    Args:
        x_agent_api_key: Value of the ``X-Agent-API-Key`` header, if sent.

    Raises:
        HTTPException: 401 when a key is configured and the header is missing
            or does not match.
    """
    # Function-scope import: the module header does not import hmac.
    import hmac

    expected = os.environ.get("AGENT_API_KEY", "").strip()
    if not expected:
        return

    supplied = (x_agent_api_key or "").strip()
    # Compare in constant time so response timing does not leak how much of
    # the key matched. Bytes are used because compare_digest rejects
    # non-ASCII str; surrogateescape tolerates undecodable environment bytes.
    keys_match = hmac.compare_digest(
        supplied.encode("utf-8", "surrogateescape"),
        expected.encode("utf-8", "surrogateescape"),
    )
    if not keys_match:
        raise HTTPException(
            status_code=401,
            detail="Set header X-Agent-API-Key to match AGENT_API_KEY environment variable",
        )
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
class AgentRedactDocumentRequest(BaseModel):
    """Parity with Gradio api_name ``redact_document``."""

    # Required; at least one path must be supplied.
    input_files: list[str] = Field(
        ...,
        min_length=1,
        description="Paths to input files (PDF, images, or tabular/Word for anonymisation)",
    )
    # Free-text guidance; length is capped by the validator below.
    instruction: Optional[str] = Field(
        default=None,
        description="Optional instructions for LLM-based PII detection (custom_llm_instructions)",
    )
    output_dir: Optional[str] = None
    input_dir: Optional[str] = None
    ocr_method: Optional[str] = Field(
        default=None,
        description=(
            "High-level OCR/text mode. Accepted values: 'Local OCR', "
            "'AWS Textract', 'Local text'. To choose a specific local OCR engine "
            "(e.g. paddle/tesseract/vlm), set "
            "overrides.chosen_local_ocr_model."
        ),
    )
    pii_detector: Optional[str] = Field(
        default=None,
        description=(
            "PII detection method. Recommended configured labels: "
            f"'{LOCAL_PII_OPTION}', '{AWS_PII_OPTION}', '{AWS_LLM_PII_OPTION}', "
            f"'{INFERENCE_SERVER_PII_OPTION}', '{LOCAL_TRANSFORMERS_LLM_PII_OPTION}', "
            "'None'."
        ),
    )
    overrides: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Optional CLI flag overrides; keys must match argparse destination names. "
            "For local OCR model selection, set 'chosen_local_ocr_model' "
            f"(allowed models depend on deployment; configured options: {LOCAL_OCR_MODEL_OPTIONS})."
        ),
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "input_files": [
                        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
                    ],
                    "instruction": "Do not redact the university name.",
                    "ocr_method": "Local OCR",
                    "pii_detector": LOCAL_PII_OPTION,
                    "overrides": {"chosen_local_ocr_model": "paddle"},
                }
            ]
        }
    }

    @field_validator("instruction")
    @classmethod
    def _cap_instruction(cls, v: Optional[str]) -> Optional[str]:
        # None means "no instruction supplied" and is always accepted;
        # anything longer than _MAX_INSTRUCTION_LEN characters is rejected.
        too_long = v is not None and len(v) > _MAX_INSTRUCTION_LEN
        if too_long:
            raise ValueError(f"instruction exceeds {_MAX_INSTRUCTION_LEN} characters")
        return v
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# Adds no fields of its own: the /redact_data endpoint accepts exactly the
# same payload (and validation) as AgentRedactDocumentRequest.
class AgentRedactDataRequest(AgentRedactDocumentRequest):
    """Parity with Gradio api_name ``redact_data``; same CLI task as redact_document."""
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# Response envelope returned by every endpoint that runs a CLI task through
# _run_cli_main.
class AgentTaskResponse(BaseModel):
    # Outcome marker; _run_cli_main sets it to "completed".
    status: str
    # Gradio api_name this endpoint mirrors.
    gradio_api_name: str
    # CLI task name taken from the direct-mode args (e.g. "redact").
    task: str
    # Stringified output_dir / input_dir entries of the direct-mode args.
    output_dir: str
    input_dir: str
    # Human-readable summary of what happened.
    message: str
    # Tail of the captured console output, or None when nothing was printed.
    log_excerpt: Optional[str] = None
    # Not populated by _run_cli_main; defaults to None.
    output_paths: Optional[list[str]] = None
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Request payload for the verify_redaction_coverage operation.
class AgentVerifyRedactionRequest(BaseModel):
    # Required inputs: the review plan and the word-level OCR output.
    review_csv_path: str = Field(..., description="Path to *_review_file.csv")
    ocr_words_csv_path: str = Field(
        ...,
        description="Path to *_ocr_results_with_words_*.csv from the same run",
    )
    # Optional regex expectations.
    must_redact: Optional[list[str]] = Field(
        default=None,
        description="Regex patterns for terms that must be covered by review boxes.",
    )
    must_not_redact: Optional[list[str]] = Field(
        default=None,
        description="Regex patterns for terms that must not appear in review rows.",
    )
    redacted_pdf_path: Optional[str] = Field(
        default=None,
        description="Optional applied *_redacted.pdf for text-layer leak checks.",
    )
    # Must be >= 1 when given.
    total_pages: Optional[int] = Field(default=None, ge=1)
    # Bounded to the range 1..32; defaults to 3.
    min_word_length: int = Field(default=3, ge=1, le=32)
    sample_pixels: bool = Field(
        default=False,
        description="Sample pixel darkness at box centres on redacted PDF (requires redacted_pdf_path).",
    )
    auto_prune_suspicious: bool = Field(
        default=False,
        description="Remove prunable suspicious short/OCR-fragment rows and write pruned CSV.",
    )
    pruned_output_path: Optional[str] = Field(
        default=None,
        description="Output path for pruned CSV when auto_prune_suspicious is true.",
    )
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# Response payload for the verify_redaction_coverage operation.
class AgentVerifyRedactionResponse(BaseModel):
    status: str
    # Fixed to the Gradio api_name this endpoint mirrors.
    gradio_api_name: str = "verify_redaction_coverage"
    # Pass/fail verdicts; the exact criteria are computed by the verifier,
    # which is not visible in this part of the file.
    coverage_pass: bool
    coverage_pass_strict: bool
    coverage_pass_with_cleanup: bool
    # Optional; both default to None.
    pruned_csv_path: Optional[str] = None
    prune_log: Optional[Dict[str, Any]] = None
    # Full verification report as a free-form mapping.
    report: Dict[str, Any]
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
# Request payload for the word_level_ocr_text_search operation.
class AgentWordLevelOcrSearchRequest(BaseModel):
    ocr_words_csv_path: str = Field(
        ...,
        description="Path to *_ocr_results_with_words_*.csv",
    )
    # Between 3 and 500 characters.
    search_text: str = Field(..., min_length=3, max_length=500)
    # Constrained to [0.0, 1.0]; defaults to 1.0.
    similarity_threshold: float = Field(default=1.0, ge=0.0, le=1.0)
    use_regex: bool = False
    review_csv_path: Optional[str] = Field(
        default=None,
        description="Optional *_review_file.csv to flag whether each hit is covered by a box.",
    )
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
# Response payload for the word_level_ocr_text_search operation.
class AgentWordLevelOcrSearchResponse(BaseModel):
    status: str
    # Fixed to the Gradio api_name this endpoint mirrors.
    gradio_api_name: str = "word_level_ocr_text_search"
    # Search output as a free-form mapping.
    result: Dict[str, Any]
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _merge_redact_direct_mode(body: AgentRedactDocumentRequest) -> dict[str, Any]:
    """Build the ``direct_mode_args`` dict for a CLI ``redact`` run.

    Starts from ``get_cli_default_args_dict()``, forces ``task="redact"``,
    then layers the request's explicit fields and finally its ``overrides``
    on top.

    Every path that reaches the CLI is validated against the allowed roots.
    That includes ``input_file``, ``output_dir`` and ``input_dir`` when they
    arrive through ``overrides``; without this, an override could replace an
    already-validated path with an arbitrary one.

    Args:
        body: Parsed request payload.

    Returns:
        The merged argument dictionary to pass as ``direct_mode_args``.

    Raises:
        HTTPException: 400 for an unknown override key or a malformed
            ``input_file`` override, plus any 400/403 raised by the path
            validators.
    """
    from cli_redact import get_cli_default_args_dict

    merged: dict[str, Any] = get_cli_default_args_dict()
    merged["task"] = "redact"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    if body.instruction is not None:
        merged["custom_llm_instructions"] = body.instruction
    if body.output_dir is not None:
        # Output folders may not exist yet (CLI will create). Still constrain to allowed roots.
        merged["output_dir"] = _path_must_be_allowed_directory(
            body.output_dir, must_exist=False
        )
    if body.input_dir is not None:
        # Input dir should exist if provided.
        merged["input_dir"] = _path_must_be_allowed_directory(
            body.input_dir, must_exist=True
        )
    if body.ocr_method is not None:
        merged["ocr_method"] = body.ocr_method
    if body.pii_detector is not None:
        merged["pii_detector"] = body.pii_detector

    if body.overrides:
        allowed = set(merged)
        for key, value in body.overrides.items():
            if key not in allowed:
                raise HTTPException(
                    status_code=400,
                    detail=f"Unknown override key '{key}'. Must be a known CLI argument name.",
                )
            # Path-bearing overrides get the same containment checks as the
            # dedicated request fields. None is passed through untouched.
            # NOTE(review): other CLI arguments may also carry file paths;
            # confirm against the argparse definition and extend if so.
            if value is not None:
                if key == "input_file":
                    if isinstance(value, str):
                        value = _path_must_be_allowed_file(value)
                    elif isinstance(value, list):
                        value = [_path_must_be_allowed_file(p) for p in value]
                    else:
                        raise HTTPException(
                            status_code=400,
                            detail="Override 'input_file' must be a path or a list of paths.",
                        )
                elif key == "output_dir":
                    value = _path_must_be_allowed_directory(value, must_exist=False)
                elif key == "input_dir":
                    value = _path_must_be_allowed_directory(value, must_exist=True)
            merged[key] = value

    return merged
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def _run_cli_main(direct: dict[str, Any], gradio_api_name: str) -> AgentTaskResponse:
    """Run ``cli_redact.main`` in direct mode and wrap the outcome.

    Anything the CLI prints to stdout while it runs is captured, and the last
    8000 characters are returned as ``log_excerpt``.

    Args:
        direct: Fully merged argument dict passed as ``direct_mode_args``.
        gradio_api_name: Gradio api_name echoed back in the response.

    Returns:
        An ``AgentTaskResponse`` with ``status="completed"``.

    Raises:
        HTTPException: 500 carrying ``str(exc)`` when the CLI raises any
            ``Exception``.
    """
    # Function-scope import: the module header does not import contextlib.
    import contextlib

    from cli_redact import main as cli_main

    captured = io.StringIO()
    try:
        # redirect_stdout restores sys.stdout on exit, including on error.
        # NOTE(review): the redirection is process-global, so output from
        # requests handled concurrently in other threads may be captured or
        # lost here - confirm whether that matters for this deployment.
        with contextlib.redirect_stdout(captured):
            cli_main(direct_mode_args=direct)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Keep only the tail so responses stay bounded in size.
    log_excerpt = captured.getvalue()[-8000:]

    return AgentTaskResponse(
        status="completed",
        gradio_api_name=gradio_api_name,
        task=str(direct.get("task", "")),
        output_dir=str(direct.get("output_dir", "")),
        input_dir=str(direct.get("input_dir", "")),
        message="cli_redact.main finished; see log_excerpt for console output",
        log_excerpt=log_excerpt or None,
    )
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
@router.post(
    "/redact_document",
    response_model=AgentTaskResponse,
    summary="redact_document (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_document'``. "
        "``python cli_redact.py --task redact --input_file ...``. "
        "Optional ``instruction`` maps to ``custom_llm_instructions``. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_document(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Validate and merge the payload into CLI direct-mode args, then run the
    # CLI and report under the "redact_document" api_name.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
@router.post(
    "/redact_data",
    response_model=AgentTaskResponse,
    summary="redact_data (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_data'``. Same CLI ``redact`` task as "
        "/redact_document; use CSV/XLSX/DOCX paths for tabular/Word flows. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_data(
    body: AgentRedactDataRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Same merge-and-run pipeline as /redact_document; only the api_name
    # reported in the response differs.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_data")
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
@router.post(
    "/tasks/redact",
    response_model=AgentTaskResponse,
    summary="Legacy: same as /redact_document",
    description="Deprecated alias; prefer POST /agent/redact_document.",
    deprecated=True,
    include_in_schema=True,
)
def post_tasks_redact_legacy(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Deprecated alias: behaves exactly like /redact_document and reports the
    # same "redact_document" api_name.
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
# Request payload for /find_duplicate_pages. Every optional field left as
# None keeps the CLI default for the argument of the same name.
class AgentFindDuplicatePagesRequest(BaseModel):
    # Required; at least one path.
    input_files: list[str] = Field(..., min_length=1)
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    min_consecutive_pages: Optional[int] = None
    # The handler forwards these two booleans to the CLI as the strings
    # "True" / "False".
    greedy_match: Optional[bool] = None
    combine_pages: Optional[bool] = None
    # Extra CLI arguments; the handler rejects keys that are not known CLI
    # argument names.
    overrides: Optional[dict[str, Any]] = None
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
@router.post(
    "/find_duplicate_pages",
    response_model=AgentTaskResponse,
    summary="find_duplicate_pages (Gradio api_name)",
    description="``cli_redact --task deduplicate --duplicate_type pages``.",
)
def post_find_duplicate_pages(
    body: AgentFindDuplicatePagesRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "deduplicate" task with duplicate_type="pages".
    # Raises HTTPException 400 for an unknown override key or malformed
    # input_file override, plus any 400/403 raised by the path validators.
    from cli_redact import get_cli_default_args_dict

    merged = get_cli_default_args_dict()
    merged["task"] = "deduplicate"
    merged["duplicate_type"] = "pages"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    # Copy only the values the caller actually supplied.
    for field_name in ("similarity_threshold", "min_word_count", "min_consecutive_pages"):
        supplied = getattr(body, field_name)
        if supplied is not None:
            merged[field_name] = supplied
    # These two are forwarded as the strings "True" / "False".
    if body.greedy_match is not None:
        merged["greedy_match"] = "True" if body.greedy_match else "False"
    if body.combine_pages is not None:
        merged["combine_pages"] = "True" if body.combine_pages else "False"

    if body.overrides:
        allowed = set(merged)
        for k, v in body.overrides.items():
            if k not in allowed:
                raise HTTPException(400, f"Unknown override key: {k}")
            # Path-bearing overrides must pass the same containment checks as
            # input_files; otherwise they could replace validated paths with
            # arbitrary ones. None is passed through untouched.
            # NOTE(review): other CLI arguments may also carry file paths;
            # confirm against the argparse definition and extend if so.
            if v is not None:
                if k == "input_file":
                    if isinstance(v, str):
                        v = _path_must_be_allowed_file(v)
                    elif isinstance(v, list):
                        v = [_path_must_be_allowed_file(p) for p in v]
                    else:
                        raise HTTPException(
                            400, "Override 'input_file' must be a path or a list of paths."
                        )
                elif k == "output_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=False)
                elif k == "input_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=True)
            merged[k] = v
    return _run_cli_main(merged, "find_duplicate_pages")
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# Request payload for /find_duplicate_tabular. Every optional field left as
# None keeps the CLI default for the argument of the same name.
class AgentFindDuplicateTabularRequest(BaseModel):
    # Required; at least one path.
    input_files: list[str] = Field(..., min_length=1)
    text_columns: Optional[list[str]] = None
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    # Extra CLI arguments; the handler rejects keys that are not known CLI
    # argument names.
    overrides: Optional[dict[str, Any]] = None
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
@router.post(
    "/find_duplicate_tabular",
    response_model=AgentTaskResponse,
    summary="find_duplicate_tabular (Gradio api_name)",
)
def post_find_duplicate_tabular(
    body: AgentFindDuplicateTabularRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "deduplicate" task with duplicate_type="tabular".
    # Raises HTTPException 400 for an unknown override key or malformed
    # input_file override, plus any 400/403 raised by the path validators.
    # (Comments rather than a docstring: no explicit description is set on
    # the route, so a docstring would become the OpenAPI description.)
    from cli_redact import get_cli_default_args_dict

    merged = get_cli_default_args_dict()
    merged["task"] = "deduplicate"
    merged["duplicate_type"] = "tabular"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    # Copy only the values the caller actually supplied.
    for field_name in ("text_columns", "similarity_threshold", "min_word_count"):
        supplied = getattr(body, field_name)
        if supplied is not None:
            merged[field_name] = supplied

    if body.overrides:
        allowed = set(merged)
        for k, v in body.overrides.items():
            if k not in allowed:
                raise HTTPException(400, f"Unknown override key: {k}")
            # Path-bearing overrides must pass the same containment checks as
            # input_files; otherwise they could replace validated paths with
            # arbitrary ones. None is passed through untouched.
            # NOTE(review): other CLI arguments may also carry file paths;
            # confirm against the argparse definition and extend if so.
            if v is not None:
                if k == "input_file":
                    if isinstance(v, str):
                        v = _path_must_be_allowed_file(v)
                    elif isinstance(v, list):
                        v = [_path_must_be_allowed_file(p) for p in v]
                    else:
                        raise HTTPException(
                            400, "Override 'input_file' must be a path or a list of paths."
                        )
                elif k == "output_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=False)
                elif k == "input_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=True)
            merged[k] = v
    return _run_cli_main(merged, "find_duplicate_tabular")
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# Request payload for /summarise_document. Every optional field left as None
# keeps the CLI default for the argument of the same name.
class AgentSummariseDocumentRequest(BaseModel):
    # Required; at least one path.
    input_files: list[str] = Field(..., min_length=1)
    summarisation_inference_method: Optional[str] = None
    summarisation_format: Optional[str] = None
    summarisation_context: Optional[str] = None
    summarisation_additional_instructions: Optional[str] = None
    # Extra CLI arguments; the handler rejects keys that are not known CLI
    # argument names.
    overrides: Optional[dict[str, Any]] = None
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
@router.post(
    "/summarise_document",
    response_model=AgentTaskResponse,
    summary="summarise_document (Gradio api_name)",
)
def post_summarise_document(
    body: AgentSummariseDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "summarise" task on the validated input files.
    # Raises HTTPException 400 for an unknown override key or malformed
    # input_file override, plus any 400/403 raised by the path validators.
    # (Comments rather than a docstring: no explicit description is set on
    # the route, so a docstring would become the OpenAPI description.)
    from cli_redact import get_cli_default_args_dict

    merged = get_cli_default_args_dict()
    merged["task"] = "summarise"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    # Copy only the values the caller actually supplied.
    for field_name in (
        "summarisation_inference_method",
        "summarisation_format",
        "summarisation_context",
        "summarisation_additional_instructions",
    ):
        supplied = getattr(body, field_name)
        if supplied is not None:
            merged[field_name] = supplied

    if body.overrides:
        allowed = set(merged)
        for k, v in body.overrides.items():
            if k not in allowed:
                raise HTTPException(400, f"Unknown override key: {k}")
            # Path-bearing overrides must pass the same containment checks as
            # input_files; otherwise they could replace validated paths with
            # arbitrary ones. None is passed through untouched.
            # NOTE(review): other CLI arguments may also carry file paths;
            # confirm against the argparse definition and extend if so.
            if v is not None:
                if k == "input_file":
                    if isinstance(v, str):
                        v = _path_must_be_allowed_file(v)
                    elif isinstance(v, list):
                        v = [_path_must_be_allowed_file(p) for p in v]
                    else:
                        raise HTTPException(
                            400, "Override 'input_file' must be a path or a list of paths."
                        )
                elif k == "output_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=False)
                elif k == "input_dir":
                    v = _path_must_be_allowed_directory(v, must_exist=True)
            merged[k] = v
    return _run_cli_main(merged, "summarise_document")
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
# Request payload for /combine_review_pdfs.
class AgentCombineReviewPdfsRequest(BaseModel):
    # Required; at least two paths, since combining needs more than one input.
    input_files: list[str] = Field(..., min_length=2)
    # Optional output directory; when omitted the CLI default is kept.
    output_dir: Optional[str] = None
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
@router.post(
    "/combine_review_pdfs",
    response_model=AgentTaskResponse,
    summary="combine_review_pdfs (Gradio api_name)",
)
def post_combine_review_pdfs(
    body: AgentCombineReviewPdfsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "combine_review_pdfs" task on the validated input paths.
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "combine_review_pdfs"

    # Validate each requested path in order; the helper returns the value
    # that is handed on to the CLI.
    checked_inputs = []
    for candidate in body.input_files:
        checked_inputs.append(_path_must_be_allowed_file(candidate))
    cli_args["input_file"] = checked_inputs

    # Only replace the default output directory when the caller supplied one.
    requested_output = body.output_dir
    if requested_output is not None:
        cli_args["output_dir"] = _path_must_be_allowed_directory(requested_output)

    return _run_cli_main(cli_args, "combine_review_pdfs")
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
class _NamedPath:
|
| 603 |
+
"""merge_csv_files expects objects with a .name attribute (Gradio file-like)."""
|
| 604 |
+
|
| 605 |
+
__slots__ = ("name",)
|
| 606 |
+
|
| 607 |
+
def __init__(self, path: str) -> None:
|
| 608 |
+
self.name = path
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
class AgentCombineReviewCsvsRequest(BaseModel):
    # Request body for POST /combine_review_csvs.
    # (Comment instead of a docstring so the generated schema is unchanged.)

    # CSV paths to merge; at least one is required.
    input_files: list[str] = Field(..., min_length=1)
    # Optional destination directory for the merged output.
    output_dir: Optional[str] = Field(
        default=None,
        description="Defaults to config OUTPUT_FOLDER",
    )
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
class AgentApplyReviewRedactionsRequest(BaseModel):
    """Headless parity with Gradio ``api_name='apply_review_redactions'`` (prepare + apply)."""

    # --- required inputs -------------------------------------------------
    pdf_path: str = Field(
        ...,
        description="Path to the source PDF under allowed roots.",
    )
    review_csv_path: str = Field(
        ...,
        description=(
            "Path to the review plan CSV; basename must contain '_review_file' "
            "(e.g. mydoc_review_file.csv)."
        ),
    )

    # --- optional directories --------------------------------------------
    output_dir: Optional[str] = Field(
        default=None,
        description="Output directory (created if missing); defaults to OUTPUT_FOLDER.",
    )
    input_dir: Optional[str] = Field(
        default=None,
        description="Input/working directory for page images; defaults to INPUT_FOLDER.",
    )

    # --- optional prepare-step settings ----------------------------------
    text_extract_method: Optional[str] = Field(
        default=None,
        description="OCR/text mode passed to prepare (defaults to CLI ocr_method).",
    )
    efficient_ocr: Optional[bool] = Field(
        default=None,
        description="If set, overrides EFFICIENT_OCR for the prepare step.",
    )
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
@router.post(
    "/combine_review_csvs",
    response_model=AgentTaskResponse,
    summary="combine_review_csvs (Gradio api_name)",
    description="Uses tools.helper_functions.merge_csv_files (not cli_redact).",
)
def post_combine_review_csvs(
    body: AgentCombineReviewCsvsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Merges review CSVs by calling merge_csv_files directly (no CLI run).
    from tools.helper_functions import merge_csv_files

    # Wrap each validated path so it exposes the ``.name`` attribute.
    named_inputs = []
    for candidate in body.input_files:
        named_inputs.append(_NamedPath(_path_must_be_allowed_file(candidate)))

    # Fall back to the configured output folder when none (or "") was given.
    requested_dir = body.output_dir or OUTPUT_FOLDER
    resolved_dir = _path_must_be_allowed_directory(str(requested_dir), must_exist=True)

    # merge_csv_files receives the folder with a trailing separator.
    if resolved_dir.endswith(("/", "\\")):
        folder_argument = resolved_dir
    else:
        folder_argument = resolved_dir + "/"

    merged_files = merge_csv_files(named_inputs, output_folder=folder_argument)

    return AgentTaskResponse(
        status="completed",
        gradio_api_name="combine_review_csvs",
        task="combine_review_csvs",
        output_dir=resolved_dir,
        input_dir="",
        message="merge_csv_files completed",
        output_paths=merged_files,
    )
|
| 676 |
+
|
| 677 |
+
|
| 678 |
+
class AgentExportReviewRedactionOverlayRequest(BaseModel):
    """Agent JSON body for the same overlay render as Gradio ``api_name='page_redaction_review_image'``."""

    # --- required inputs -------------------------------------------------
    page_image_path: str = Field(
        ...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    boxes: List[Dict[str, Any]] = Field(
        ...,
        min_length=1,
        description="Annotator-style boxes: label, color, xmin, ymin, xmax, ymax (normalized 0–1).",
    )

    # --- output naming ---------------------------------------------------
    page_number: int = Field(
        default=1,
        ge=1,
        description="1-based page index for the output filename.",
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )

    # --- optional rendering hints ----------------------------------------
    review_df_records: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Optional rows (include at least 'label') for stable label→line-pattern mapping.",
    )
    label_abbrev_chars: Optional[int] = Field(
        default=None,
        ge=0,
        le=24,
        description="Draw this many leading characters of each label on the image; omit to use REVIEW_OVERLAY_LABEL_ABBREV_CHARS from config (0 = off).",
    )
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
class AgentExportReviewPageOcrVisualisationRequest(BaseModel):
    """Agent JSON body for the same OCR visualisation as Gradio ``api_name='page_ocr_review_image'``."""

    # --- required inputs -------------------------------------------------
    page_image_path: str = Field(
        ...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    ocr_results: Dict[str, Any] = Field(
        ...,
        description="Word-level OCR results dict (line_key -> {words:[{text, bounding_box, conf, ...}]}).",
    )

    # --- output naming ---------------------------------------------------
    page_number: int = Field(
        default=1,
        ge=1,
        description="1-based page index (used for naming).",
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
@router.post(
    "/export_review_redaction_overlay",
    response_model=AgentTaskResponse,
    summary="export_review_redaction_overlay (Agent API; Gradio api_name: page_redaction_review_image)",
    description=(
        "Renders hollow redaction outlines and a top-right legend on the page image; "
        "writes ``redaction_overlay/{doc_base_name}_page{n}_redaction_overlay.jpg`` under OUTPUT_FOLDER "
        "(scaled per REVIEW_OVERLAY_MAX_PIXELS, JPEG capped by REVIEW_OVERLAY_MAX_FILE_BYTES). "
        "Uses ``tools.redaction_review.visualise_review_redaction_boxes``."
    ),
)
def post_export_review_redaction_overlay(
    body: AgentExportReviewRedactionOverlayRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Renders the redaction-box overlay for one page image and reports the
    # written file. Raises HTTPException 400 when OUTPUT_FOLDER fails the
    # path-safety check, and 500 when the folder cannot be created or the
    # renderer returns no path.
    import pandas as pd

    from tools.redaction_review import visualise_review_redaction_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)
    annotator: dict[str, Any] = {"image": img_path, "boxes": body.boxes}

    # An empty frame is passed when no review rows were supplied.
    if body.review_df_records:
        review_df = pd.DataFrame(body.review_df_records)
    else:
        review_df = pd.DataFrame()

    # Resolve the configured output folder to a real absolute path before
    # checking it against the safety rules and the allowed roots.
    out_folder = os.path.realpath(
        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
    )
    if not validate_path_safety(out_folder):
        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
    _must_be_under_allowed_roots(out_folder, str(out_folder))
    try:
        Path(out_folder).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Keep the underlying OS error attached as the cause for server logs.
        raise HTTPException(
            status_code=500, detail="Could not create OUTPUT_FOLDER"
        ) from e

    # doc_base_name is caller-supplied and ends up in the output filename,
    # so reduce it to a single path component (no directory parts, no "..").
    base_name = os.path.basename(
        str(body.doc_base_name or "review").replace("\\", "/").rstrip("/")
    )
    if base_name in ("", ".", ".."):
        base_name = "review"

    path = visualise_review_redaction_boxes(
        annotator,
        review_df=review_df,
        output_folder=out_folder,
        page_number=body.page_number,
        doc_base_name=base_name,
        label_abbrev_chars=body.label_abbrev_chars,
    )
    if not path:
        # The route documents a .jpg output, so the message says "image"
        # rather than naming a specific format.
        raise HTTPException(
            status_code=500,
            detail=(
                "Could not produce overlay image (invalid image/boxes or write failed). "
                "Ensure boxes are valid and the image loads."
            ),
        )
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_redaction_overlay",
        task="export_review_redaction_overlay",
        output_dir=out_folder,
        input_dir="",
        message="Redaction overlay image written",
        output_paths=[path],
    )
|
| 792 |
+
|
| 793 |
+
|
| 794 |
+
@router.post(
    "/export_review_page_ocr_visualisation",
    response_model=AgentTaskResponse,
    summary="export_review_page_ocr_visualisation (Agent API; Gradio api_name: page_ocr_review_image)",
    description=(
        "Renders a per-page OCR visualisation using tools.file_redaction.visualise_ocr_words_bounding_boxes; "
        "writes under OUTPUT_FOLDER/review_ocr_visualisations/."
    ),
)
def post_export_review_page_ocr_visualisation(
    body: AgentExportReviewPageOcrVisualisationRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Renders the word-level OCR visualisation for one page image and reports
    # the written file. Raises HTTPException 400 when OUTPUT_FOLDER fails the
    # path-safety check, and 500 when the folder cannot be created, rendering
    # raises, or no output path is produced.
    from PIL import Image

    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)

    # Resolve the configured output folder to a real absolute path before
    # checking it against the safety rules and the allowed roots.
    out_folder = os.path.realpath(
        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
    )
    if not validate_path_safety(out_folder):
        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
    _must_be_under_allowed_roots(out_folder, str(out_folder))
    try:
        Path(out_folder).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Keep the underlying OS error attached as the cause for server logs.
        raise HTTPException(
            status_code=500, detail="Could not create OUTPUT_FOLDER"
        ) from e

    # doc_base_name is caller-supplied and becomes part of the image filename,
    # so reduce it to a single path component (no directory parts, no "..").
    safe_base = os.path.basename(
        str(body.doc_base_name or "review").replace("\\", "/").rstrip("/")
    )
    if safe_base in ("", ".", ".."):
        safe_base = "review"
    image_name = f"{safe_base}_page{int(body.page_number)}.png"

    log_paths: list[str] = []
    try:
        # convert() returns a separate RGB image, so the source file handle
        # can be released as soon as the with-block ends.
        with Image.open(img_path) as source_image:
            rgb_image = source_image.convert("RGB")
        log_paths = visualise_ocr_words_bounding_boxes(
            rgb_image,
            body.ocr_results,
            image_name=image_name,
            output_folder=out_folder,
            visualisation_folder="review_ocr_visualisations",
            add_legend=True,
            log_files_output_paths=log_paths,
        )
    except Exception as e:
        # Route boundary: any rendering failure is reported as a 500.
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not log_paths:
        raise HTTPException(
            status_code=500,
            detail="Could not produce OCR visualisation (invalid image/ocr_results or write failed).",
        )
    # The last entry of the returned list is reported as the output file.
    out_path = log_paths[-1]
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_page_ocr_visualisation",
        task="export_review_page_ocr_visualisation",
        output_dir=out_folder,
        input_dir="",
        message="OCR visualisation written",
        output_paths=[out_path],
    )
|
| 856 |
+
|
| 857 |
+
|
| 858 |
+
def _gradio_only(api_name: str, detail: str) -> JSONResponse:
    """Build the 501 response returned for flows that exist only on the Gradio API.

    Args:
        api_name: Gradio ``api_name`` of the flow, echoed back and
            interpolated into the suggested Gradio HTTP calls.
        detail: Explanation placed in the ``detail`` field of the body.

    Returns:
        A ``JSONResponse`` with status 501 whose body tells the caller how to
        reach the same flow through the Gradio HTTP API.
    """
    # How to call the named route over Gradio's HTTP API.
    http_recipe = {
        "discover_schema": "GET /gradio_api/info",
        "start_call": f"POST /gradio_api/call/{api_name}",
        "request_body_shape": '{"data": [<args in schema order>]}',
        "poll": f"GET /gradio_api/call/{api_name}/{{event_id}}",
    }
    # Practical advice for gradio_client users.
    client_notes = [
        "Pass api_name explicitly; do not rely on inferring the endpoint from "
        "Python function names (large Blocks apps will look ambiguous).",
        "If predict() still cannot resolve the route, open GET /gradio_api/info "
        "and use the numeric fn_index with gradio_client, or call the HTTP "
        "endpoints directly.",
        "The length of data must match the parameter list for this deployment; "
        "copy order and types from /gradio_api/info.",
    ]
    payload = {
        "gradio_api_name": api_name,
        "detail": detail,
        "hint": (
            "This flow is Gradio-session stateful. Call the named route on the "
            "Gradio HTTP API, not /agent."
        ),
        "gradio_http": http_recipe,
        "gradio_client_notes": client_notes,
    }
    return JSONResponse(status_code=501, content=payload)
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
@router.post("/load_and_prepare_documents_or_data")
def post_load_and_prepare_documents_or_data() -> JSONResponse:
    # This flow is not implemented on /agent; answer with the shared 501
    # payload that points the caller at the Gradio HTTP API.
    gradio_name = "load_and_prepare_documents_or_data"
    reason = "Preparation uses Gradio session state and prepare_image_or_pdf_with_efficient_ocr; no single CLI task."
    return _gradio_only(gradio_name, reason)
|
| 893 |
+
|
| 894 |
+
|
| 895 |
+
@router.post(
    "/apply_review_redactions",
    response_model=AgentTaskResponse,
    summary="apply_review_redactions (Gradio api_name)",
    description=(
        "Runs prepare_image_or_pdf_with_efficient_ocr([pdf, review_csv]) then "
        "apply_redactions_to_review_df_and_files — same core pipeline as the Review tab, "
        "without Gradio session state. Requires paths under allowed roots."
    ),
)
def post_apply_review_redactions(
    body: AgentApplyReviewRedactionsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Validates the request paths, delegates to run_apply_review_redactions
    # and maps its result dict onto the response model. ValueError from the
    # runner becomes HTTP 400; any other exception becomes HTTP 500.
    from tools.simplified_api import run_apply_review_redactions

    # Validation order is kept: PDF, review CSV, output dir, input dir.
    validated_pdf = _path_must_be_allowed_file(body.pdf_path)
    validated_csv = _path_must_be_allowed_file(body.review_csv_path)

    # Directories are optional and are not required to exist yet.
    validated_output: str | None = (
        _path_must_be_allowed_directory(body.output_dir, must_exist=False)
        if body.output_dir is not None
        else None
    )
    validated_input: str | None = (
        _path_must_be_allowed_directory(body.input_dir, must_exist=False)
        if body.input_dir is not None
        else None
    )

    try:
        result = run_apply_review_redactions(
            pdf_path=validated_pdf,
            review_csv_path=validated_csv,
            output_dir=validated_output,
            input_dir=validated_input,
            text_extract_method=body.text_extract_method,
            efficient_ocr=body.efficient_ocr,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        failure_detail = f"apply_review_redactions failed: {e}"
        raise HTTPException(status_code=500, detail=failure_detail) from e

    return AgentTaskResponse(
        status="completed",
        gradio_api_name="apply_review_redactions",
        task="apply_review_redactions",
        output_dir=result["output_dir"],
        input_dir=result["input_dir"],
        message=result["message"],
        # output_paths may be absent from the result; .get yields None then.
        output_paths=result.get("output_paths"),
    )
|
| 946 |
+
|
| 947 |
+
|
| 948 |
+
@router.post(
    "/verify_redaction_coverage",
    response_model=AgentVerifyRedactionResponse,
    summary="verify_redaction_coverage (Pass 1 programmatic QA)",
)
def post_verify_redaction_coverage(
    body: AgentVerifyRedactionRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentVerifyRedactionResponse:
    # Validates the input paths, runs run_verify_redaction_coverage and wraps
    # its (report, pruned CSV path, prune log) result in the response model.
    # ValueError becomes HTTP 400; any other exception becomes HTTP 500.
    from tools.simplified_api import run_verify_redaction_coverage

    review_path = _path_must_be_allowed_file(body.review_csv_path)
    ocr_words_path = _path_must_be_allowed_file(body.ocr_words_csv_path)
    # The redacted PDF is optional; validate it only when one was given.
    redacted_path = (
        _path_must_be_allowed_file(body.redacted_pdf_path)
        if body.redacted_pdf_path
        else None
    )

    # NOTE(review): pruned_output_path is forwarded without an allowed-roots
    # check here, unlike the other paths — confirm the runner validates it.
    runner_kwargs = {
        "review_csv_path": review_path,
        "ocr_words_csv_path": ocr_words_path,
        "must_redact": body.must_redact,
        "must_not_redact": body.must_not_redact,
        "redacted_pdf_path": redacted_path,
        "total_pages": body.total_pages,
        "min_word_length": body.min_word_length,
        "sample_pixels": body.sample_pixels,
        "auto_prune_suspicious": body.auto_prune_suspicious,
        "pruned_output_path": body.pruned_output_path,
    }
    try:
        # Unpacking stays inside the try so a malformed result is handled
        # by the same except clauses as a failure inside the runner.
        report, pruned_csv_path, prune_log = run_verify_redaction_coverage(
            **runner_kwargs
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        failure_detail = f"verify_redaction_coverage failed: {e}"
        raise HTTPException(status_code=500, detail=failure_detail) from e

    # coverage_pass and coverage_pass_strict are both taken from
    # "pass_strict", falling back to "pass" when that key is absent.
    strict_ok = bool(report.get("pass_strict", report.get("pass")))
    cleanup_ok = bool(report.get("pass_with_cleanup"))
    return AgentVerifyRedactionResponse(
        status="completed",
        coverage_pass=strict_ok,
        coverage_pass_strict=strict_ok,
        coverage_pass_with_cleanup=cleanup_ok,
        pruned_csv_path=pruned_csv_path,
        prune_log=prune_log,
        report=report,
    )
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
@router.post(
    "/word_level_ocr_text_search",
    response_model=AgentWordLevelOcrSearchResponse,
    summary="word_level_ocr_text_search (headless OCR CSV search)",
)
def post_word_level_ocr_text_search(
    body: AgentWordLevelOcrSearchRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentWordLevelOcrSearchResponse:
    # Validates the CSV paths and runs run_word_level_ocr_text_search_api.
    # ValueError becomes HTTP 400; any other exception becomes HTTP 500.
    from tools.simplified_api import run_word_level_ocr_text_search_api

    ocr_words_path = _path_must_be_allowed_file(body.ocr_words_csv_path)
    # The review CSV is optional; validate it only when one was given.
    review_path = (
        _path_must_be_allowed_file(body.review_csv_path)
        if body.review_csv_path
        else None
    )

    try:
        search_result = run_word_level_ocr_text_search_api(
            ocr_words_csv_path=ocr_words_path,
            search_text=body.search_text,
            similarity_threshold=body.similarity_threshold,
            use_regex=body.use_regex,
            review_csv_path=review_path,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        failure_detail = f"word_level_ocr_text_search failed: {e}"
        raise HTTPException(status_code=500, detail=failure_detail) from e

    return AgentWordLevelOcrSearchResponse(status="completed", result=search_result)
|
| 1024 |
+
|
| 1025 |
+
|
| 1026 |
+
@router.get("/operations")
def list_operations() -> dict[str, Any]:
    # Static catalogue of the /agent routes: the Gradio api_name each mirrors,
    # its HTTP method and path, and the implementation behind it.

    def _redaction_notes() -> dict[str, Any]:
        # Option lists shared by the two redaction routes; a fresh dict is
        # built per call so the two entries do not share one object.
        return {
            "ocr_method": [
                "Local OCR",
                "AWS Textract",
                "Local text",
            ],
            "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
            "pii_detector_recommended": [
                LOCAL_PII_OPTION,
                AWS_PII_OPTION,
                AWS_LLM_PII_OPTION,
                INFERENCE_SERVER_PII_OPTION,
                LOCAL_TRANSFORMERS_LLM_PII_OPTION,
                "None",
            ],
        }

    def _post_route(
        api_name: str,
        path: str,
        implementation: str,
        notes: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        # One catalogue entry; "notes" is only present when supplied.
        entry: dict[str, Any] = {
            "gradio_api_name": api_name,
            "method": "POST",
            "path": path,
            "implementation": implementation,
        }
        if notes is not None:
            entry["notes"] = notes
        return entry

    verify_notes = {
        "purpose": "Pass 1 programmatic QA — pass_strict (policy), pass_with_cleanup (+ suspicious rows), optional prune and text/pixel checks.",
        "must_redact": "list of regex strings",
        "must_not_redact": "list of regex strings",
        "auto_prune_suspicious": "remove short OCR-fragment rows before reporting",
        "pages_flagged_for_vlm": "policy/visual failures only",
        "pages_needing_csv_cleanup": "suspicious rows — prune, not VLM",
        "leak_likely_causes": "per-page hints when text_layer_leaks (coord_not_normalized, missing_page_boxes, etc.) — not a broken /review_apply",
    }

    # Routes that answer 501 on /agent and must be called through Gradio.
    session_state_endpoints = {
        "description": (
            "These api_name values are exposed on the Gradio HTTP API but return "
            "501 on /agent because they depend on in-memory Gradio state."
        ),
        "discover_schema": "GET /gradio_api/info",
        "call_pattern": 'POST /gradio_api/call/<api_name> with JSON body {"data": [...]}',
        "names": [
            "load_and_prepare_documents_or_data",
        ],
    }

    routes = [
        _post_route(
            "redact_document",
            "/agent/redact_document",
            "cli_redact task redact",
            _redaction_notes(),
        ),
        _post_route(
            "redact_data",
            "/agent/redact_data",
            "cli_redact task redact",
            _redaction_notes(),
        ),
        _post_route(
            "find_duplicate_pages",
            "/agent/find_duplicate_pages",
            "cli_redact deduplicate pages",
        ),
        _post_route(
            "find_duplicate_tabular",
            "/agent/find_duplicate_tabular",
            "cli_redact deduplicate tabular",
        ),
        _post_route(
            "summarise_document",
            "/agent/summarise_document",
            "cli_redact task summarise",
        ),
        _post_route(
            "combine_review_pdfs",
            "/agent/combine_review_pdfs",
            "cli_redact combine_review_pdfs",
        ),
        _post_route(
            "export_review_redaction_overlay",
            "/agent/export_review_redaction_overlay",
            "visualise_review_redaction_boxes",
        ),
        _post_route(
            "export_review_page_ocr_visualisation",
            "/agent/export_review_page_ocr_visualisation",
            "visualise_ocr_words_bounding_boxes",
        ),
        _post_route(
            "combine_review_csvs",
            "/agent/combine_review_csvs",
            "helper merge_csv_files",
        ),
        _post_route(
            "load_and_prepare_documents_or_data",
            "/agent/load_and_prepare_documents_or_data",
            "not_implemented_http",
        ),
        _post_route(
            "apply_review_redactions",
            "/agent/apply_review_redactions",
            "tools.simplified_api.run_apply_review_redactions",
        ),
        _post_route(
            "verify_redaction_coverage",
            "/agent/verify_redaction_coverage",
            "tools.verify_redaction_coverage.verify_redaction_coverage",
            verify_notes,
        ),
        _post_route(
            "word_level_ocr_text_search",
            "/agent/word_level_ocr_text_search",
            "tools.verify_redaction_coverage.run_word_level_ocr_text_search",
        ),
    ]

    return {
        "gradio_api_names": list(GRADIO_API_NAMES),
        "gradio_session_state_endpoints": session_state_endpoints,
        "routes": routes,
    }
|
| 1163 |
+
|
| 1164 |
+
|
| 1165 |
+
@router.get("/health")
def agent_health() -> dict[str, str]:
    # Liveness probe: always answers with the same fixed payload.
    payload = {"status": "ok", "service": "agent"}
    return payload
|
app.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
cdk/__init__.py
ADDED
|
File without changes
|
cdk/app.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from aws_cdk import App, Environment
|
| 4 |
+
from cdk_config import (
|
| 5 |
+
AWS_ACCOUNT_ID,
|
| 6 |
+
AWS_REGION,
|
| 7 |
+
CDK_CONTEXT_FILE,
|
| 8 |
+
RUN_USEAST_STACK,
|
| 9 |
+
USE_CLOUDFRONT,
|
| 10 |
+
)
|
| 11 |
+
from cdk_functions import (
|
| 12 |
+
create_basic_config_env,
|
| 13 |
+
load_context_from_file,
|
| 14 |
+
log_aws_credential_context,
|
| 15 |
+
purge_cdk_lookup_context,
|
| 16 |
+
)
|
| 17 |
+
from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
|
| 18 |
+
from check_resources import CONTEXT_FILE, check_and_set_context
|
| 19 |
+
|
| 20 |
+
# Initialize the CDK app
app = App()

log_aws_credential_context(
    expected_account_id=AWS_ACCOUNT_ID,
    expected_region=AWS_REGION,
)

# Drop stale CDK lookup cache entries (require bootstrap lookup role in target account).
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

# --- Pre-check context (boto3) — written to precheck.context.json, NOT cdk.context.json ---
print(f"Pre-check context file: {CONTEXT_FILE}")
print(f"CDK lookup cache file: {CDK_CONTEXT_FILE}")
# Only basenames are compared, with backslashes normalised first so that
# Windows-style paths are handled the same way as POSIX ones.
if os.path.basename(CONTEXT_FILE.replace("\\", "/")) == os.path.basename(
    CDK_CONTEXT_FILE.replace("\\", "/")
):
    raise RuntimeError(
        f"CONTEXT_FILE and CDK_CONTEXT_FILE must differ (got '{CONTEXT_FILE}' for both). "
        "Set CONTEXT_FILE=precheck.context.json in config/cdk_config.env."
    )

print("Running pre-check script to generate application context...")
try:
    check_and_set_context()
    if not os.path.exists(CONTEXT_FILE):
        raise RuntimeError(
            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
        )
    print(f"Context generated successfully at {CONTEXT_FILE}.")
except Exception as e:
    # Chain the original exception so its traceback is not lost.
    raise RuntimeError(
        f"Failed to generate context via check_and_set_context(): {e}"
    ) from e

# Pre-check must not repopulate CDK lookup keys; purge again if paths were ever shared.
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

if os.path.exists(CONTEXT_FILE):
    load_context_from_file(app, CONTEXT_FILE)
else:
    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")

create_basic_config_env("config")

aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)

regional_stack = CdkStack(
    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
)

# Both flags are compared as the literal string "True".
if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")

    cloudfront_stack = CdkStackCloudfront(
        app,
        "RedactionStackCloudfront",
        env=aws_env_us_east_1,
        alb_arn=regional_stack.params["alb_arn_output"],
        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
        alb_dns_name=regional_stack.params["alb_dns_name"],
        cross_region_references=True,
    )

# CDK CLI invokes this script and expects a cloud assembly in cdk.out.
# Without app.synth(), Python defines constructs but never writes manifest.json
# (ENOENT on deploy). See: https://github.com/aws/aws-cdk/issues/11023
app.synth()
|
cdk/cdk.json.example
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"app": "python app.py",
|
| 3 |
+
"output": "cdk.out",
|
| 4 |
+
"context": {
|
| 5 |
+
"@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": false
|
| 6 |
+
}
|
| 7 |
+
}
|
cdk/cdk_config.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Set or retrieve configuration variables for CDK redaction deployment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def convert_string_to_boolean(value: str) -> bool:
    """Convert a string (or an existing bool) to a boolean.

    Accepted spellings are ``true``/``1`` and ``false``/``0``. Matching is
    case-insensitive and ignores surrounding whitespace, so values copied from
    an ``.env`` file with stray spaces or unusual capitalisation still parse.

    Args:
        value: The text to interpret. A ``bool`` is returned unchanged.

    Returns:
        The boolean the value represents.

    Raises:
        ValueError: If the value is not one of the accepted spellings.
    """
    if isinstance(value, bool):
        return value

    if isinstance(value, str):
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False

    # Anything else (including non-string, non-bool input) is rejected, as before.
    raise ValueError(f"Invalid boolean value: {value}")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
    """
    Return the value of an environment variable, registering a default if it is unset.

    When ``var_name`` is missing from the environment, ``default_value`` is
    written to ``os.environ`` and returned. A variable that is already set
    (even to an empty string) is returned as-is. If ``print_val`` is ``True``
    the resolved value is printed.
    """
    try:
        resolved = os.environ[var_name]
    except KeyError:
        # Not set yet: store the default so later readers of os.environ see it too
        os.environ[var_name] = default_value
        resolved = default_value

    if print_val is True:
        print(f"The value of {var_name} is {resolved}")

    return resolved
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` (including parents) unless the path already exists.

    Prints a message saying whether the folder was created or was already there.
    """
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # Path is absent: build it, tolerating a concurrent creation
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def add_folder_to_path(folder_path: str):
    """
    Prepend an existing folder to the process ``PATH`` if it is not already listed.

    The folder is resolved to an absolute path first. This is only relevant for
    locally-created executables of this app: pyinstaller creates an ``_internal``
    folder containing tesseract and poppler, which must be on ``PATH`` for the
    app to run.

    Args:
        folder_path: Relative or absolute path of the folder to add.

    Returns:
        None. ``os.environ["PATH"]`` is updated in place; a message is printed
        for each outcome except a successful addition.
    """
    # isdir() is already False for a path that does not exist
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # Resolve relative path to absolute path
    absolute_path = os.path.abspath(folder_path)

    # PATH may be unset (e.g. a stripped-down service environment); treat as empty
    current_path = os.environ.get("PATH", "")

    if absolute_path in current_path.split(os.pathsep):
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Avoid a trailing separator when PATH was empty
    if current_path:
        os.environ["PATH"] = absolute_path + os.pathsep + current_path
    else:
        os.environ["PATH"] = absolute_path
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
###
|
| 73 |
+
# LOAD CONFIG FROM ENV FILE
|
| 74 |
+
###
|
| 75 |
+
# Folder expected to hold the CDK env file (default 'config/'); it is created
# at import time if it does not exist yet.
CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")

ensure_folder_exists(CONFIG_FOLDER)

# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
CDK_CONFIG_PATH = get_or_create_env_var(
    "CDK_CONFIG_PATH", "config/cdk_config.env"
)  # e.g. config/cdk_config.env

# An empty CDK_CONFIG_PATH in the environment skips loading altogether.
if CDK_CONFIG_PATH:
    if os.path.exists(CDK_CONFIG_PATH):
        print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
        # Loaded before any of the settings below are resolved, so values from
        # the file can feed the get_or_create_env_var calls that follow.
        # NOTE(review): called with default arguments; per python-dotenv this
        # should not override variables already set in the environment - confirm.
        load_dotenv(CDK_CONFIG_PATH)
    else:
        print("CDK config file not found at location:", CDK_CONFIG_PATH)
|
| 90 |
+
|
| 91 |
+
###
|
| 92 |
+
# AWS OPTIONS
|
| 93 |
+
###
|
| 94 |
+
# Target region and account; both default to an empty string when not configured.
AWS_REGION = get_or_create_env_var("AWS_REGION", "")
AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")

###
# CDK OPTIONS
###
# Prefix interpolated into most of the default resource names defined below.
CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
_precheck_context_file = get_or_create_env_var("CONTEXT_FILE", "precheck.context.json")
# Never write boto3 pre-check output into CDK's lookup cache file (causes stale
# vpc-provider / load-balancer entries and wrong-account lookup validation errors).
# Backslashes are normalised first so a Windows-style path is matched by its
# file name as well.
if os.path.basename(_precheck_context_file.replace("\\", "/")) == "cdk.context.json":
    print(
        "WARNING: CONTEXT_FILE must not be 'cdk.context.json' (that file is CDK's "
        "lookup cache). Using 'precheck.context.json' instead. Update "
        "config/cdk_config.env and remove CONTEXT_FILE=cdk.context.json if set."
    )
    # Only the module-level value is corrected; os.environ["CONTEXT_FILE"] keeps
    # whatever was configured.
    _precheck_context_file = "precheck.context.json"
CONTEXT_FILE = _precheck_context_file
CDK_CONTEXT_FILE = get_or_create_env_var("CDK_CONTEXT_FILE", "cdk.context.json")
CDK_FOLDER = get_or_create_env_var(
    "CDK_FOLDER", ""
)  # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
# Kept as the string "True"/"False"; not converted to bool in this module.
RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
|
| 117 |
+
|
| 118 |
+
### VPC and connections
|
| 119 |
+
# Name of the VPC to use (empty by default), plus the name and CIDR used for a
# new VPC. NOTE(review): how the consumer chooses between existing and new VPC
# is not visible here.
VPC_NAME = get_or_create_env_var("VPC_NAME", "")
NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "")  # "10.0.0.0/24"


# Optional IDs of an existing internet gateway / NAT gateway (empty = not set).
EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")

### SUBNETS / ROUTE TABLES / NAT GATEWAY
# The subnet settings are kept as raw strings; the examples suggest list
# literals that are presumably parsed by the consuming module - confirm there.
PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
    "PUBLIC_SUBNETS_TO_USE", ""
)  # e.g. ['PublicSubnet1', 'PublicSubnet2']
PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
    "PUBLIC_SUBNET_CIDR_BLOCKS", ""
)  # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
    "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
)  # e.g. ["eu-west-2a", "eu-west-2b"]

PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
    "PRIVATE_SUBNETS_TO_USE", ""
)  # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
    "PRIVATE_SUBNET_CIDR_BLOCKS", ""
)  # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
    "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
)  # e.g. ["eu-west-2a", "eu-west-2b"]

# Default names for the route table, NAT gateway and its Elastic IP all carry
# the CDK_PREFIX.
ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
    "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
)
NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
    "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
)
NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
|
| 155 |
+
|
| 156 |
+
# IAM roles
|
| 157 |
+
# AWS managed policy names, stored as a list literal inside a single string.
# NOTE(review): presumably parsed and attached to the ECS task role by the
# stack code - confirm in the consumer.
AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
    "AWS_MANAGED_TASK_ROLES_LIST",
    '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs", "AmazonBedrockFullAccess"]',
)
# Optional extra policies, given as local JSON file locations or as policy ARNs.
POLICY_FILE_LOCATIONS = get_or_create_env_var(
    "POLICY_FILE_LOCATIONS", ""
)  # e.g. '["config/sts_permissions.json"]'
POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")

# GITHUB REPO
# Owner, repository and branch; defaults point at the upstream project.
GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
|
| 170 |
+
|
| 171 |
+
### CODEBUILD
|
| 172 |
+
# Default CodeBuild role and project names carry the CDK_PREFIX.
CODEBUILD_ROLE_NAME = get_or_create_env_var(
    "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
)
CODEBUILD_PROJECT_NAME = get_or_create_env_var(
    "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
)

### ECR
ECR_REPO_NAME = get_or_create_env_var(
    "ECR_REPO_NAME", "doc-redaction"
)  # Beware - cannot have underscores and must be lower case
# .lower() is applied to the computed default only; a value supplied through
# the environment is used exactly as given.
ECR_CDK_REPO_NAME = get_or_create_env_var(
    "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
)

### S3
S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
    "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
)  # S3 bucket names need to be lower case
S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
    "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
)

### KMS KEYS FOR S3 AND SECRETS MANAGER
# Flag is kept as the string "1"/"0" (default "1"); not converted to bool here.
USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
# The default includes the "alias/" prefix and is lower-cased.
CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
    "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
)
|
| 200 |
+
|
| 201 |
+
### ECS
|
| 202 |
+
FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
    "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
)
# The default is plain string concatenation, so CDK_FOLDER and CONFIG_FOLDER
# must each end with a path separator for the result to be a valid path.
TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
    "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
)

# Default names for the cluster, service, roles and security group all carry
# the CDK_PREFIX.
CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
ECS_TASK_ROLE_NAME = get_or_create_env_var(
    "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
)
ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
    "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
)
ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
    "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
)
# Default log group name is derived from the resolved service name, lower-cased.
ECS_LOG_GROUP_NAME = get_or_create_env_var(
    "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
)

# Task sizing is kept as strings. NOTE(review): presumably CPU units and MiB as
# used by Fargate - confirm in the stack code.
ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
# Note: the environment variable read here is USE_FARGATE_SPOT (no ECS_ prefix),
# unlike the constant's name.
ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
|
| 228 |
+
|
| 229 |
+
### Cognito
|
| 230 |
+
# Default names for the user pool, its app client and the client secret all
# carry the CDK_PREFIX.
COGNITO_USER_POOL_NAME = get_or_create_env_var(
    "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
)
COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
    "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
)
COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
    "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
)
COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
    "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
)  # Should change this to something unique or you'll probably hit an error

# Token lifetimes are converted to int at import time, so a non-numeric value
# in the environment raises ValueError here.
COGNITO_REFRESH_TOKEN_VALIDITY = int(
    get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
)  # Minutes
COGNITO_ID_TOKEN_VALIDITY = int(
    get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
)  # Minutes
COGNITO_ACCESS_TOKEN_VALIDITY = int(
    get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
)  # Minutes
|
| 252 |
+
|
| 253 |
+
# Application load balancer
|
| 254 |
+
# The [-32:] slice is applied to the computed default only; a name supplied
# through the environment is not truncated.
ALB_NAME = get_or_create_env_var(
    "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
)  # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
# Note: the environment variable read here is ALB_SECURITY_GROUP_NAME, which
# differs from the constant's name.
ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
    "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
)
ALB_TARGET_GROUP_NAME = get_or_create_env_var(
    "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
)  # Max 32 characters
# Optional existing load balancer; the DNS default is a placeholder value.
EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
    "EXISTING_LOAD_BALANCER_DNS", "placeholder_load_balancer_dns.net"
)

## CLOUDFRONT
# Kept as the string "True"/"False"; compared as a string further down.
USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
# NOTE(review): the default looks like an AWS managed prefix list ID for
# CloudFront; these IDs are region-specific - confirm it matches AWS_REGION.
CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
    "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
)
CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
    "CLOUDFRONT_GEO_RESTRICTION", ""
)  # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
    "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
)
# Placeholder default; it also feeds the default Cognito redirect URL below
# when USE_CLOUDFRONT is "True".
CLOUDFRONT_DOMAIN = get_or_create_env_var(
    "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
)


# Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
    "SSL_CERTIFICATE_DOMAIN", ""
)  # e.g. example.com or www.example.com
|
| 289 |
+
|
| 290 |
+
# This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
|
| 291 |
+
# Default redirect host, in order of preference: the CloudFront domain when
# USE_CLOUDFRONT is exactly "True", else the SSL certificate domain if one is
# set, else the load balancer DNS name. An explicit COGNITO_REDIRECTION_URL in
# the environment always wins over the computed default.
COGNITO_REDIRECTION_URL = get_or_create_env_var(
    "COGNITO_REDIRECTION_URL",
    "https://"
    + (
        CLOUDFRONT_DOMAIN
        if USE_CLOUDFRONT == "True"
        else (SSL_CERTIFICATE_DOMAIN or EXISTING_LOAD_BALANCER_DNS)
    ),
)
|
| 303 |
+
|
| 304 |
+
# Custom headers e.g. if routing traffic through Cloudfront
|
| 305 |
+
# Header name/value pair; both default to empty.
# NOTE(review): presumably the stack uses these to check that requests arrived
# through CloudFront - confirm in the consumer.
CUSTOM_HEADER = get_or_create_env_var(
    "CUSTOM_HEADER", ""
)  # Retrieving or setting CUSTOM_HEADER
CUSTOM_HEADER_VALUE = get_or_create_env_var(
    "CUSTOM_HEADER_VALUE", ""
)  # Retrieving or setting CUSTOM_HEADER_VALUE

# Firewall on top of load balancer
LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
    "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
)

# Firewall on top of CloudFront
WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
|
| 319 |
+
|
| 320 |
+
###
|
| 321 |
+
# File I/O options
|
| 322 |
+
###
|
| 323 |
+
|
| 324 |
+
# Input/output folders are read from the GRADIO_* variables so they match the
# values used by the main app. The special value "TEMP" requests a temporary
# directory instead of a fixed path.
OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/")  # 'output/'
INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/")  # 'input/'

# Allow for files to be saved in a temporary folder for increased security in some instances
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Use mkdtemp() rather than `with tempfile.TemporaryDirectory()`: the context
    # manager deletes the directory as soon as the `with` block ends, which left
    # OUTPUT_FOLDER / INPUT_FOLDER pointing at a path that no longer existed.
    # The directory now persists for the life of the process; removing it
    # afterwards is the caller's responsibility.
    temp_dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {temp_dir}")

    # Both folders share the same temporary directory when both are "TEMP"
    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
|
| 337 |
+
|
| 338 |
+
###
|
| 339 |
+
# LOGGING OPTIONS
|
| 340 |
+
###
|
| 341 |
+
|
| 342 |
+
# Kept as the string "True"/"False"; not converted to bool in this module.
SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")

### DYNAMODB logs. Whether to save to DynamoDB, and the names of the log tables
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
# Default table names carry the CDK_PREFIX and are lower-cased; a name supplied
# through the environment is used as given.
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
)
FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
)
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
    "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
)

###
# REDACTION OPTIONS
###

# Authentication flag and server port for the Gradio app. Nothing is launched
# from this module; the values are only resolved here.
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")

# Converted to int at import time; a non-numeric value raises ValueError.
GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))

###
# WHOLE DOCUMENT API OPTIONS
###

# Unlike the port above, this stays a string ("7" by default).
DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
    "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
)  # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
|
cdk/cdk_functions.py
ADDED
|
@@ -0,0 +1,1665 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ipaddress
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
import boto3
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from aws_cdk import App, CfnOutput, CfnTag, Tags
|
| 9 |
+
from aws_cdk import aws_cognito as cognito
|
| 10 |
+
from aws_cdk import aws_ec2 as ec2
|
| 11 |
+
from aws_cdk import aws_elasticloadbalancingv2 as elb
|
| 12 |
+
from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
|
| 13 |
+
from aws_cdk import aws_iam as iam
|
| 14 |
+
from aws_cdk import aws_wafv2 as wafv2
|
| 15 |
+
from botocore.exceptions import ClientError, NoCredentialsError
|
| 16 |
+
from cdk_config import (
|
| 17 |
+
ACCESS_LOG_DYNAMODB_TABLE_NAME,
|
| 18 |
+
AWS_REGION,
|
| 19 |
+
FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
|
| 20 |
+
NAT_GATEWAY_EIP_NAME,
|
| 21 |
+
POLICY_FILE_LOCATIONS,
|
| 22 |
+
PRIVATE_SUBNET_AVAILABILITY_ZONES,
|
| 23 |
+
PRIVATE_SUBNET_CIDR_BLOCKS,
|
| 24 |
+
PRIVATE_SUBNETS_TO_USE,
|
| 25 |
+
PUBLIC_SUBNET_AVAILABILITY_ZONES,
|
| 26 |
+
PUBLIC_SUBNET_CIDR_BLOCKS,
|
| 27 |
+
PUBLIC_SUBNETS_TO_USE,
|
| 28 |
+
S3_LOG_CONFIG_BUCKET_NAME,
|
| 29 |
+
S3_OUTPUT_BUCKET_NAME,
|
| 30 |
+
USAGE_LOG_DYNAMODB_TABLE_NAME,
|
| 31 |
+
)
|
| 32 |
+
from constructs import Construct
|
| 33 |
+
from dotenv import set_key
|
| 34 |
+
|
| 35 |
+
# Key prefixes under which the CDK CLI caches lookup-provider results in
# cdk.context.json. Any top-level key starting with one of these is treated as
# a cached lookup entry.
_CDK_LOOKUP_CONTEXT_PREFIXES = (
    "vpc-provider:",
    "load-balancer:",
    "availability-zones:",
    "hosted-zone:",
    "security-group:",
    "key-provider:",
    "ami:",
)


def purge_cdk_lookup_context(file_path: str) -> int:
    """Drop cached CDK lookup entries from a context JSON file.

    Every top-level key that starts with one of ``_CDK_LOOKUP_CONTEXT_PREFIXES``
    is removed. The file is rewritten (2-space indent) only when at least one
    key was removed; a missing file is left alone.

    Args:
        file_path: Path to the context file (e.g. ``cdk.context.json``).

    Returns:
        The number of keys removed; ``0`` if the file does not exist or holds
        no lookup entries.
    """
    if not os.path.exists(file_path):
        return 0

    with open(file_path, "r", encoding="utf-8") as handle:
        loaded = json.load(handle)

    # Keep every entry that is not a cached lookup result, preserving order
    kept = {}
    for name, payload in loaded.items():
        if name.startswith(_CDK_LOOKUP_CONTEXT_PREFIXES):
            continue
        kept[name] = payload

    dropped = len(loaded) - len(kept)
    if dropped == 0:
        # Nothing to purge: leave the file untouched
        return dropped

    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(kept, handle, indent=2)
    print(f"Removed {dropped} stale CDK lookup context key(s) from {file_path}.")
    return dropped
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def log_aws_credential_context(
    expected_account_id: Optional[str] = None,
    expected_region: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Report which AWS identity boto3 resolves, without printing any secrets.

    Prints the relevant environment hints (profile, region, whether the
    credential variables are set), the boto3 session profile/region, a
    classification of the credentials based on the access-key prefix, and the
    result of ``sts:GetCallerIdentity``. Only the first four characters of the
    access key are ever shown.

    Args:
        expected_account_id: Account the CDK app is configured to target; when
            given, it is compared with the caller's account.
        expected_region: Region the CDK app is configured to target; printed,
            and used for the STS client when the session has no region.

    Returns:
        A summary dict. It always contains ``profile`` and ``session_profile``;
        ``error`` is set when no credentials are found or the STS call fails;
        otherwise it also carries ``access_key_prefix``,
        ``inferred_credential_type``, ``account``, ``arn``, ``user_id``,
        ``principal_kind`` and, on a mismatch, ``account_mismatch``.
    """
    profile_unset = "(not set — using default profile chain)"
    end_banner = "--- End AWS credential context ---\n"

    profile = os.environ.get("AWS_PROFILE") or profile_unset
    default_region = (
        os.environ.get("AWS_REGION")
        or os.environ.get("AWS_DEFAULT_REGION")
        or "(not set in environment)"
    )
    # Only record whether each variable is set; the values are never printed.
    env_states = {
        var_name: "set" if os.environ.get(var_name) else "not set"
        for var_name in (
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_SESSION_TOKEN",
        )
    }
    env_access_key_set = env_states["AWS_ACCESS_KEY_ID"] == "set"

    print("\n--- AWS credential context (CDK / boto3) ---")
    print(f"AWS_PROFILE: {profile}")
    print(f"AWS_REGION / AWS_DEFAULT_REGION (env): {default_region}")
    print(
        "Environment credential variables: "
        + ", ".join(f"{var_name}={state}" for var_name, state in env_states.items())
    )
    if expected_account_id:
        print(f"Configured CDK target account (AWS_ACCOUNT_ID): {expected_account_id}")
    if expected_region:
        print(f"Configured CDK target region (AWS_REGION): {expected_region}")

    session = boto3.Session()
    active_profile = session.profile_name or "(default)"
    print(f"boto3 session profile: {active_profile}")
    print(f"boto3 session region: {session.region_name or '(not set)'}")

    credential_summary: Dict[str, Any] = {
        "profile": profile,
        "session_profile": active_profile,
    }

    credentials = session.get_credentials()
    if credentials is None:
        print("WARNING: No AWS credentials found in the default provider chain.")
        print(end_banner)
        credential_summary["error"] = "no_credentials"
        return credential_summary

    access_key = credentials.get_frozen_credentials().access_key or ""
    if len(access_key) >= 4:
        access_key_prefix = access_key[:4] + "..."
    else:
        access_key_prefix = "(none)"
    credential_summary["access_key_prefix"] = access_key_prefix

    # Environment variables are checked first; otherwise classify by key prefix.
    if env_access_key_set:
        credential_source = "environment variables (highest precedence)"
    elif access_key.startswith("AKIA"):
        credential_source = "long-lived access key (likely ~/.aws/credentials [default] or named profile)"
    elif access_key.startswith("ASIA"):
        credential_source = "temporary credentials (SSO, assumed role, or STS session)"
    else:
        credential_source = (
            "resolved credentials (source could not be classified from key prefix)"
        )

    print(f"Inferred credential type: {credential_source}")
    credential_summary["inferred_credential_type"] = credential_source

    if env_access_key_set and profile != profile_unset:
        print(
            "NOTE: AWS_ACCESS_KEY_ID is set in the environment, so it overrides "
            f"profile '{profile}' and SSO."
        )

    try:
        sts = session.client("sts", region_name=session.region_name or expected_region)
        identity = sts.get_caller_identity()
    except (ClientError, NoCredentialsError) as exc:
        print(f"WARNING: sts:GetCallerIdentity failed: {exc}")
        print(end_banner)
        credential_summary["error"] = str(exc)
        return credential_summary

    account = identity.get("Account", "")
    arn = identity.get("Arn", "")
    user_id = identity.get("UserId", "")

    print(f"Caller account: {account}")
    print(f"Caller ARN: {arn}")
    print(f"Caller UserId: {user_id}")

    # First ARN marker that matches wins; order mirrors the precedence above.
    arn_markers = (
        (":assumed-role/", "assumed IAM role (typical for SSO or role chaining)"),
        (":user/", "IAM user (typical for static access keys in credentials file)"),
        (":federated-user/", "federated user"),
    )
    principal_kind = next(
        (label for marker, label in arn_markers if marker in arn),
        "other IAM principal",
    )

    print(f"Principal kind: {principal_kind}")
    credential_summary.update(
        {
            "account": account,
            "arn": arn,
            "user_id": user_id,
            "principal_kind": principal_kind,
        }
    )

    if expected_account_id:
        if account == str(expected_account_id):
            print("Caller account matches configured AWS_ACCOUNT_ID.")
        elif account:
            print(
                "WARNING: Caller account does not match configured AWS_ACCOUNT_ID. "
                "CDK will target the configured account but act as this identity — "
                "deployments and lookups may fail. Set AWS_PROFILE to your SSO profile "
                "and unset AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY if needed."
            )
            credential_summary["account_mismatch"] = True

    if profile == profile_unset:
        print(
            "TIP: Set AWS_PROFILE to your SSO profile name so Python and the CDK CLI "
            "(Node) use the same session. Example: "
            '$env:AWS_PROFILE = "YourSsoProfileName"'
        )

    print(end_banner)
    return credential_summary
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# --- Function to load context from file ---
|
| 201 |
+
def load_context_from_file(app: App, file_path: str):
    """Copy every top-level key of a JSON context file into the CDK app's context.

    Args:
        app: The CDK app whose root node receives the context values.
        file_path: Path to the JSON context file.

    If the file does not exist, a message is printed and nothing is loaded.
    """
    if not os.path.exists(file_path):
        print(f"Context file not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as context_file:
        context_data = json.load(context_file)

    for context_key, context_value in context_data.items():
        app.node.set_context(context_key, context_value)
    print(f"Loaded context from {file_path}")
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# --- Helper to parse environment variables into lists ---
|
| 213 |
+
def _get_env_list(env_var_name: str) -> List[str]:
|
| 214 |
+
"""Parses a comma-separated environment variable into a list of strings."""
|
| 215 |
+
value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
|
| 216 |
+
if not value:
|
| 217 |
+
return []
|
| 218 |
+
# Split by comma and filter out any empty strings that might result from extra commas
|
| 219 |
+
return [s.strip() for s in value.split(",") if s.strip()]
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# 1. Convert the list-like settings imported from cdk_config into real lists.
# _get_env_list parses the *value* it is given, so each call must pass the
# variable itself; passing the setting's name as a string literal would parse
# the name (e.g. "PUBLIC_SUBNET_CIDR_BLOCKS") instead of the configured value.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )

if POLICY_FILE_LOCATIONS:
    POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def check_for_existing_role(role_name: str):
    """Look up an IAM role by name via boto3.

    Args:
        role_name: Name of the IAM role to look up.

    Returns:
        A 3-tuple ``(exists, role_arn, "")``: ``(True, <arn>, "")`` when the
        role is found, ``(False, "", "")`` when IAM reports it does not exist.
        The third element is always an empty string.

    Raises:
        Exception: If the lookup fails for any reason other than the role not
            existing; the original error is chained as ``__cause__``.
    """
    # Named iam_client (not `iam`) so the module-level aws_cdk `iam` import is
    # not shadowed, and created outside the try so the except clause below can
    # always resolve `iam_client.exceptions`.
    iam_client = boto3.client("iam")
    try:
        response = iam_client.get_role(RoleName=role_name)
        role_arn = response["Role"]["Arn"]
    except iam_client.exceptions.NoSuchEntityException:
        return False, "", ""
    except Exception as e:
        raise Exception(
            f"Getting information on IAM role failed due to: {e}"
        ) from e

    print("Response Role:", role_arn)
    return True, role_arn, ""
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
from typing import List
|
| 261 |
+
|
| 262 |
+
# Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
|
| 263 |
+
# For example:
|
| 264 |
+
# POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
    """
    Attach each statement of a parsed IAM policy document to a CDK role.

    A statement that cannot be converted or attached is reported with a
    warning and skipped; the remaining statements are still processed.

    Args:
        role: The CDK role that receives the statements.
        policy_document: Parsed IAM policy document; must hold a list under
            the "Statement" key, otherwise a warning is printed and nothing
            is added.
    """
    has_statement_list = "Statement" in policy_document and isinstance(
        policy_document["Statement"], list
    )
    if not has_statement_list:
        print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
        return

    for statement_dict in policy_document["Statement"]:
        try:
            # Build the CDK statement from its JSON form and attach it.
            role.add_to_policy(iam.PolicyStatement.from_json(statement_dict))
            print(f"  - Added statement: {statement_dict.get('Sid', 'No Sid')}")
        except Exception as e:
            print(
                f"Warning: Could not process policy statement: {statement_dict}. Error: {e}"
            )
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def add_custom_policies(
    scope: Construct,  # Not strictly used here, but good practice if you expand to ManagedPolicies
    role: iam.IRole,
    policy_file_locations: Optional[List[str]] = None,
    custom_policy_text: Optional[str] = None,
) -> iam.IRole:
    """
    Loads custom policies from JSON files and/or a JSON string and attaches them to a CDK Role.

    Processing is best-effort: a missing file, invalid JSON, or any other
    per-source failure is printed as a warning and that source is skipped.

    Args:
        scope: The scope in which to define constructs (currently unused).
        role: The CDK Role construct to attach policies to.
        policy_file_locations: List of file paths to JSON policy documents.
        custom_policy_text: A JSON string representing a policy document.

    Returns:
        The same role object that was passed in.
    """
    if policy_file_locations is None:
        policy_file_locations = []

    current_source = "unknown source"  # For error messages

    try:
        if policy_file_locations:
            print(f"Attempting to add policies from files to role {role.node.id}...")
            for path in policy_file_locations:
                current_source = f"file: {path}"
                try:
                    # Read as UTF-8 explicitly (as the other JSON reads in this
                    # module do) rather than relying on the platform default.
                    with open(path, "r", encoding="utf-8") as f:
                        policy_document = json.load(f)
                    print(f"Processing policy from {current_source}...")
                    add_statement_to_policy(role, policy_document)
                except FileNotFoundError:
                    print(f"Warning: Policy file not found at {path}. Skipping.")
                except json.JSONDecodeError as e:
                    print(
                        f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
                    )
                except Exception as e:
                    print(
                        f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
                    )

        if custom_policy_text:
            current_source = "custom policy text string"
            print(
                f"Attempting to add policy from custom text to role {role.node.id}..."
            )
            try:
                # The text is a JSON string; parse it into a dict before use.
                policy_document = json.loads(custom_policy_text)
                print(f"Processing policy from {current_source}...")
                add_statement_to_policy(role, policy_document)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
            except Exception as e:
                print(
                    f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
                )

        print(f"Finished processing custom policies for role {role.node.id}.")

    except Exception as e:
        # Last-resort guard (e.g. failures outside the per-source handlers).
        print(
            f"An unhandled error occurred during policy addition for {current_source}: {e}"
        )

    return role
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
# Import the S3 Bucket class if you intend to return a CDK object later
|
| 369 |
+
# from aws_cdk import aws_s3 as s3
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def check_s3_bucket_exists(
    bucket_name: str,
):
    """
    Probe an S3 bucket with ``head_bucket`` and report whether it is usable.

    Args:
        bucket_name: The name of the S3 bucket to check.

    Returns:
        A 2-tuple:
        - ``(True, bucket_name)`` when ``head_bucket`` succeeds.
        - ``(False, None)`` when the error code is "404".
        - ``(False, bucket_name)`` when the error code is "403". The result
          is ambiguous in that case, so the bucket is reported as not existing.

    Raises:
        ClientError: For any other AWS error code (re-raised after logging).
        Exception: Any non-ClientError failure is logged and re-raised.
    """
    s3_client = boto3.client("s3")
    try:
        # head_bucket checks both existence and access in one call.
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' exists and is accessible.")
        return True, bucket_name

    except ClientError as e:
        error_code = e.response["Error"]["Code"]

        if error_code == "404":
            print(f"Bucket '{bucket_name}' does not exist.")
            return False, None

        if error_code == "403":
            # Per the original author's testing, 403 was also returned for
            # buckets that do not exist, so it is treated as "does not exist".
            print(
                f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
            )
            return False, bucket_name

        # Anything else is unexpected: log it and let the caller see it.
        print(
            f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
# Example usage in your check_resources.py:
|
| 430 |
+
# exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
|
| 431 |
+
# context_data[f"exists:{log_bucket_name}"] = exists
|
| 432 |
+
# # You don't necessarily need to store the name in context if using from_bucket_name
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
# Delete an S3 bucket
|
| 436 |
+
def delete_s3_bucket(bucket_name: str):
    """Delete every object version and delete marker in a bucket, then the bucket.

    Args:
        bucket_name: Name of the S3 bucket to delete.

    Returns:
        ``{"Status": "SUCCESS"}`` on success, otherwise
        ``{"Status": "FAILED", "Reason": <error text>}``. Errors are reported
        through the return value rather than raised.
    """
    s3 = boto3.client("s3")

    try:
        # list_object_versions returns results one page at a time, so walk all
        # pages; otherwise a large bucket is left non-empty and delete_bucket fails.
        paginator = s3.get_paginator("list_object_versions")
        for page in paginator.paginate(Bucket=bucket_name):
            versions = page.get("Versions", []) + page.get("DeleteMarkers", [])
            for version in versions:
                s3.delete_object(
                    Bucket=bucket_name,
                    Key=version["Key"],
                    VersionId=version["VersionId"],
                )

        # Delete the (now empty) bucket
        s3.delete_bucket(Bucket=bucket_name)
        return {"Status": "SUCCESS"}
    except Exception as e:
        return {"Status": "FAILED", "Reason": str(e)}
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
# Function to get subnet ID from subnet name
|
| 456 |
+
def get_subnet_id(vpc: str, ec2_client: str, subnet_name: str):
    """Return the ID of the first subnet in a VPC whose 'Name' tag matches.

    NOTE(review): the ``str`` annotations on ``vpc`` and ``ec2_client`` do not
    match how they are used below (``vpc.vpc_id`` and
    ``ec2_client.describe_subnets``); they are left unchanged here.

    Args:
        vpc: Object exposing a ``vpc_id`` attribute.
        ec2_client: Client object exposing ``describe_subnets``.
        subnet_name: Value of the 'Name' tag to look for.

    Returns:
        The matching subnet's ``SubnetId``, or ``None`` if no subnet matches.
    """
    response = ec2_client.describe_subnets(
        Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
    )

    for subnet in response["Subnets"]:
        # A subnet entry may have no "Tags" key at all; treat that as no tags
        # instead of raising KeyError.
        tags = subnet.get("Tags") or []
        if any(tag["Key"] == "Name" and tag["Value"] == subnet_name for tag in tags):
            return subnet["SubnetId"]

    return None
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
    """
    Look up an ECR repository by name.

    Args:
        repo_name: The name of the ECR repository to check.

    Returns:
        A 2-tuple ``(exists, repository)``: ``(True, <repository dict>)`` when
        the repository is described successfully, ``(False, {})`` when ECR
        reports ``RepositoryNotFoundException`` or a non-ClientError failure
        occurs (the latter is printed).

    Raises:
        ClientError: For any AWS error other than the repository not existing.
    """
    ecr_client = boto3.client("ecr")
    try:
        print("ecr repo_name to check:", repo_name)
        response = ecr_client.describe_repositories(repositoryNames=[repo_name])
        repositories = response["repositories"]
        # An empty list raises IndexError here, which the generic handler
        # below turns into (False, {}).
        return len(repositories) > 0, repositories[0]
    except ClientError as e:
        if e.response["Error"]["Code"] != "RepositoryNotFoundException":
            # Unexpected AWS error: let the caller deal with it.
            raise
        return False, {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def check_codebuild_project_exists(
    project_name: str,
):
    """
    Look up a CodeBuild project by name using ``batch_get_projects``.

    Args:
        project_name: The name of the CodeBuild project to check.

    Returns:
        A 3-tuple ``(exists, project_arn, service_role)``:
        - ``(True, <arn>, <serviceRole or None>)`` when the project is found.
        - ``(False, None, None)`` otherwise.

    Raises:
        ClientError: AWS errors are logged and re-raised.
        Exception: Any other failure is logged and re-raised.
    """
    codebuild_client = boto3.client("codebuild")
    try:
        # Single-name lookup; the response splits names into 'projects'
        # (found) and 'projectsNotFound'.
        response = codebuild_client.batch_get_projects(names=[project_name])

        found_projects = response["projects"]
        if found_projects:
            print(f"CodeBuild project '{project_name}' found.")
            project = found_projects[0]
            return (
                True,
                project["arn"],
                project.get("serviceRole"),
            )

        not_found = response["projectsNotFound"]
        if not_found and project_name in not_found:
            print(f"CodeBuild project '{project_name}' not found.")
        else:
            # Neither list mentions the project; treat it as missing.
            print(
                f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
            )
        return False, None, None

    except ClientError as e:
        # A missing project normally shows up in 'projectsNotFound', so a
        # ClientError here means something else went wrong (e.g. permissions).
        print(
            f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        raise
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        raise
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def get_vpc_id_by_name(
    vpc_name: str,
) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
    """
    Finds a VPC by its 'Name' tag and lists the NAT Gateways inside it.

    Args:
        vpc_name: Value of the VPC's 'Name' tag.

    Returns:
        ``(vpc_id, nat_gateways)`` for the first matching VPC, where
        ``nat_gateways`` is the ``NatGateways`` list from
        ``describe_nat_gateways`` (empty if that call fails or returns none),
        or ``None`` when no VPC matches. Callers that unpack the result must
        handle the ``None`` case first.

    Raises:
        Exception: Any failure of the VPC lookup itself is logged and re-raised.
    """
    ec2_client = boto3.client("ec2")
    try:
        response = ec2_client.describe_vpcs(
            Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
        )
        if not (response and response["Vpcs"]):
            print(f"VPC '{vpc_name}' not found.")
            return None

        vpc_id = response["Vpcs"][0]["VpcId"]
        print(f"VPC '{vpc_name}' found with ID: {vpc_id}")

        # Look for NAT Gateways in this VPC (same client; no need for a second one).
        nat_gateways = []
        try:
            nat_response = ec2_client.describe_nat_gateways(
                Filters=[
                    {"Name": "vpc-id", "Values": [vpc_id]},
                ]
            )
            nat_gateways = nat_response.get("NatGateways", [])
        except Exception as e:
            # Best-effort: the VPC ID is still returned with an empty NAT list.
            print(
                f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}"
            )

        return vpc_id, nat_gateways
    except Exception as e:
        print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
        raise
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
# --- Helper to fetch all existing subnets in a VPC once ---
|
| 611 |
+
def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
    """
    Collect the subnets of a VPC, together with their route-table associations.

    Args:
        vpc_id: ID of the VPC to inspect.

    Returns:
        A dictionary with three entries:
        - 'by_name': subnet info keyed by 'Name' tag (only subnets that have one).
        - 'by_id': subnet info keyed by subnet ID.
        - 'cidr_networks': parsed network objects for each valid subnet CIDR.
        Each subnet info dict has the keys 'id', 'cidr', 'name', 'az' and
        'route_table_id'.

    Raises:
        Exception: Any failure while querying EC2 is logged and re-raised.
    """
    ec2_client = boto3.client("ec2")
    by_name: Dict[str, Any] = {}
    by_id: Dict[str, Any] = {}
    cidr_networks: List[Any] = []
    existing_subnets_data = {
        "by_name": by_name,
        "by_id": by_id,
        "cidr_networks": cidr_networks,
    }
    vpc_filter = [{"Name": "vpc-id", "Values": [vpc_id]}]

    try:
        # Map each explicitly associated subnet to its route table.
        subnet_to_route_table: Dict[str, str] = {}
        rt_response = ec2_client.describe_route_tables(Filters=vpc_filter)
        for route_table in rt_response.get("RouteTables", []):
            route_table_id = route_table["RouteTableId"]
            for association in route_table.get("Associations", []):
                associated_subnet_id = association.get("SubnetId")
                if not associated_subnet_id:
                    continue
                subnet_to_route_table[associated_subnet_id] = route_table_id

        response = ec2_client.describe_subnets(Filters=vpc_filter)
        for subnet in response.get("Subnets", []):
            subnet_id = subnet["SubnetId"]
            cidr_block = subnet.get("CidrBlock")
            # The 'Name' tag is what lookups by name rely on.
            name_tag = next(
                (tag["Value"] for tag in subnet.get("Tags", []) if tag["Key"] == "Name"),
                None,
            )

            subnet_info = {
                "id": subnet_id,
                "cidr": cidr_block,
                "name": name_tag,
                "az": subnet.get("AvailabilityZone"),
                "route_table_id": subnet_to_route_table.get(subnet_id),
            }

            if name_tag:
                by_name[name_tag] = subnet_info
            by_id[subnet_id] = subnet_info

            if not cidr_block:
                continue
            try:
                cidr_networks.append(ipaddress.ip_network(cidr_block, strict=False))
            except ValueError:
                print(
                    f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
                )

        print(
            f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
        )
    except Exception as e:
        print(
            f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
        )
        raise  # This lookup is essential, so do not continue without it

    return existing_subnets_data
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
# --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
|
| 682 |
+
def validate_subnet_creation_parameters(
    vpc_id: str,
    proposed_subnets_data: List[Dict[str, str]],
    existing_aws_subnets_data: Dict[str, Any],
) -> None:
    """
    Check proposed subnets against each other and against pre-fetched AWS data.

    For every proposed subnet, in order: all of 'name', 'cidr' and 'az' must
    be present; the name must be unique within the batch; the CIDR must parse;
    and the CIDR must not overlap an earlier proposed CIDR or any existing
    subnet CIDR. A name that already exists in the VPC is only reported with
    a message; it does not fail validation.

    Args:
        vpc_id: The ID of the VPC (used in log and error messages only).
        proposed_subnets_data: Proposed subnets, e.g.
            [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}].
        existing_aws_subnets_data: Data with 'by_name' and 'cidr_networks'
            entries (e.g., from _get_existing_subnets_in_vpc).

    Raises:
        ValueError: On an incomplete entry, a duplicated proposed name, an
            unparsable CIDR, or a CIDR overlap.
    """
    if not proposed_subnets_data:
        print("No proposed subnet data provided for validation. Skipping.")
        return

    print(
        f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
    )

    print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))

    existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
    existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]

    # Names and networks accepted so far, for consistency within the batch.
    proposed_names_seen: set[str] = set()
    proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []

    for index, candidate in enumerate(proposed_subnets_data):
        subnet_name = candidate.get("name")
        cidr_block_str = candidate.get("cidr")
        availability_zone = candidate.get("az")

        if not (subnet_name and cidr_block_str and availability_zone):
            raise ValueError(
                f"Proposed subnet at index {index} is incomplete. Requires 'name', 'cidr', and 'az'."
            )

        # 1. Name must be unique within the proposed batch.
        if subnet_name in proposed_names_seen:
            raise ValueError(
                f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
            )
        proposed_names_seen.add(subnet_name)

        # 2. A name clash with an existing subnet is reported, not rejected.
        if subnet_name in existing_aws_subnet_names:
            print(
                f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
            )

        try:
            proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
        except ValueError as e:
            raise ValueError(
                f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}"
            )

        # 3. No overlap with CIDRs accepted earlier in this batch.
        batch_clash = next(
            (net for net in proposed_cidr_networks_seen if proposed_net.overlaps(net)),
            None,
        )
        if batch_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with another proposed CIDR '{str(batch_clash)}' "
                f"within the same batch."
            )

        # 4. No overlap with CIDRs already present in the VPC.
        aws_clash = next(
            (net for net in existing_aws_cidr_networks if proposed_net.overlaps(net)),
            None,
        )
        if aws_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with an existing AWS subnet CIDR '{str(aws_clash)}' "
                f"in VPC '{vpc_id}'."
            )

        # Accepted: later entries in the batch are checked against this one too.
        proposed_cidr_networks_seen.append(proposed_net)
        print(
            f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'."
        )

    print(
        f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
    )
|
| 782 |
+
|
| 783 |
+
|
| 784 |
+
# --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
|
| 785 |
+
def check_subnet_exists_by_name(
    subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """
    Look up a subnet by its 'Name' tag in already-fetched AWS subnet data.

    No AWS call is made here; only the ``"by_name"`` mapping of the supplied
    dictionary is consulted.

    Args:
        subnet_name: The 'Name' tag value of the subnet to look for.
        existing_aws_subnets_data: Pre-fetched subnet data. It must contain a
            ``"by_name"`` mapping from subnet name to a record that holds at
            least an ``"id"`` entry.

    Returns:
        ``(True, subnet_id)`` when a non-empty record is stored under
        ``subnet_name``; ``(False, None)`` otherwise.

    Raises:
        KeyError: If ``existing_aws_subnets_data`` has no ``"by_name"`` entry.
    """
    match = existing_aws_subnets_data["by_name"].get(subnet_name)

    # Guard clause: a missing (or empty) record means the subnet is unknown.
    if not match:
        print(f"Subnet '{subnet_name}' not found.")
        return False, None

    found_id = match["id"]
    print(f"Subnet '{subnet_name}' found with ID: {found_id}")
    return True, found_id
|
| 808 |
+
|
| 809 |
+
|
| 810 |
+
def create_nat_gateway(
    scope: Construct,
    public_subnet_for_nat: ec2.ISubnet,  # Expects a proper ISubnet
    nat_gateway_name: str,
    nat_gateway_id_context_key: str,
) -> str:
    """
    Define one NAT Gateway, together with its Elastic IP, in a public subnet.

    Looking up an existing gateway from context is not done here; the calling
    stack is expected to handle that before deciding to call this function.

    Args:
        scope: Construct that will own the EIP, the NAT Gateway and the output.
            Its ``stack_name`` attribute is read to build the export name.
        public_subnet_for_nat: Subnet whose ``subnet_id`` the gateway is placed in.
        nat_gateway_name: Value of the gateway's 'Name' tag; with hyphens
            removed it also forms the gateway's logical ID.
        nat_gateway_id_context_key: Context key mentioned in the output
            description so the deployed ID can be copied into cdk.context.json.

    Returns:
        The CloudFormation Ref of the NAT Gateway ID.
    """
    target_subnet_id = public_subnet_for_nat.subnet_id
    print(
        f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{target_subnet_id}'."
    )

    # The gateway needs its own Elastic IP; it is named after the module-level constant.
    elastic_ip = ec2.CfnEIP(
        scope,
        NAT_GATEWAY_EIP_NAME,
        tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
    )

    # Hyphens are stripped from the name before it is used as a logical ID.
    gateway_construct_id = "".join(nat_gateway_name.split("-")) + "NatGateway"
    gateway = ec2.CfnNatGateway(
        scope,
        gateway_construct_id,
        subnet_id=target_subnet_id,
        allocation_id=elastic_ip.attr_allocation_id,
        tags=[CfnTag(key="Name", value=nat_gateway_name)],
    )
    # Explicit dependency on the EIP; the subnet dependency is implied by subnet_id.
    gateway.add_dependency(elastic_ip)

    gateway_ref = gateway.ref

    # Export the ID so it can be read from the stack outputs after deployment
    # and added to cdk.context.json.
    CfnOutput(
        scope,
        "SingleNatGatewayIdOutput",
        value=gateway_ref,
        description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
        export_name=f"{scope.stack_name}-NatGatewayId",
    )

    print(
        f"CDK: Defined new NAT Gateway '{gateway_ref}'. Its physical ID will be available in the stack outputs after deployment."
    )
    # Tokenised reference, usable elsewhere within this synthesis.
    return gateway_ref
|
| 859 |
+
|
| 860 |
+
|
| 861 |
+
def create_subnets(
    scope: Construct,
    vpc: ec2.IVpc,
    prefix: str,
    subnet_names: List[str],
    cidr_blocks: List[str],
    availability_zones: List[str],
    is_public: bool,
    internet_gateway_id: Optional[str] = None,
    single_nat_gateway_id: Optional[str] = None,
) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
    """
    Create one subnet per entry of the parallel input lists and give each a default route.

    Public subnets get a default route to the Internet Gateway; private subnets
    get a default route to the single NAT Gateway. A failure to add the route is
    reported on stdout and does not abort the loop (best effort).

    Args:
        scope: Construct that will own the subnets.
        vpc: VPC whose ``vpc_id`` the subnets are created in.
        prefix: Prefix for the logical IDs, which have the form
            ``{prefix}PublicSubnet{n}`` or ``{prefix}PrivateSubnet{n}`` with n starting at 1.
        subnet_names: 'Name' tag value for each subnet.
        cidr_blocks: CIDR block for each subnet.
        availability_zones: Availability Zone for each subnet.
        is_public: True to create public subnets, False for private ones.
        internet_gateway_id: Required when ``is_public`` is True.
        single_nat_gateway_id: Required when ``is_public`` is False.

    Returns:
        A tuple ``(subnets, route_tables)``. NOTE: although the return annotation
        names the L1 ``Cfn*`` types, the lists hold the ``ec2.Subnet`` objects
        created here and each subnet's ``route_table`` attribute.

    Raises:
        ValueError: If the three lists are empty or differ in length, or if the
            gateway ID needed for the requested subnet type is missing.
    """
    # --- Input validation (performed before any construct is created) ---
    if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
        raise ValueError(
            "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
        )
    if is_public and not internet_gateway_id:
        raise ValueError("internet_gateway_id must be provided for public subnets.")
    if not is_public and not single_nat_gateway_id:
        raise ValueError(
            "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
        )

    created_subnets: List[ec2.Subnet] = []
    created_route_tables: List[ec2.IRouteTable] = []

    subnet_type_tag = "public" if is_public else "private"

    for i, subnet_name in enumerate(subnet_names):
        logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"

        subnet = ec2.Subnet(
            scope,
            logical_id,
            vpc_id=vpc.vpc_id,
            cidr_block=cidr_blocks[i],
            availability_zone=availability_zones[i],
            map_public_ip_on_launch=is_public,
        )
        Tags.of(subnet).add("Name", subnet_name)
        Tags.of(subnet).add("Type", subnet_type_tag)

        if is_public:
            try:
                # No destination is passed, so add_route's default destination applies.
                subnet.add_route(
                    "DefaultInternetRoute",  # A logical ID for the CfnRoute resource
                    router_id=internet_gateway_id,
                    router_type=ec2.RouterType.GATEWAY,
                )
            except Exception as e:
                print("Could not create IGW route for public subnet due to:", e)
            else:
                # Only claim the route was added when add_route actually succeeded.
                print(
                    f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route."
                )
        else:
            try:
                subnet.add_route(
                    "DefaultNatRoute",  # A logical ID for the CfnRoute resource
                    router_id=single_nat_gateway_id,
                    router_type=ec2.RouterType.NAT_GATEWAY,
                )
            except Exception as e:
                # This branch handles private subnets; the message now says so.
                print("Could not create NAT gateway route for private subnet due to:", e)
            else:
                print(
                    f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
                )

        created_subnets.append(subnet)
        created_route_tables.append(subnet.route_table)

    return created_subnets, created_route_tables
|
| 941 |
+
|
| 942 |
+
|
| 943 |
+
def ingress_rule_exists(security_group: str, peer: str, port: str):
    """
    Report whether any entry of ``security_group.connections.security_groups`` matches.

    An entry matches when its ``peer`` equals ``peer`` and, if ``port`` is
    truthy, its ``connection`` also equals ``port``. With a falsy ``port`` only
    the peer is compared.

    NOTE: despite the ``str`` annotation, ``security_group`` must be an object
    exposing ``connections.security_groups``.

    Returns:
        True if at least one entry matches, False otherwise.
    """

    def _matches(candidate) -> bool:
        if not (candidate.peer == peer):
            return False
        # The port is only part of the comparison when one was supplied.
        return (not port) or candidate.connection == port

    return any(
        _matches(candidate)
        for candidate in security_group.connections.security_groups
    )
|
| 952 |
+
|
| 953 |
+
|
| 954 |
+
def check_for_existing_user_pool(user_pool_name: str):
    """
    Search the account's Cognito user pools for one with the given name.

    The listing is followed page by page via ``NextToken``, so pools beyond the
    first 60 results are also found.

    Args:
        user_pool_name: Exact name of the user pool to look for.

    Returns:
        A tuple:
        - ``(True, pool_id, pool)`` for the first pool whose ``Name`` matches,
          where ``pool`` is the entry from the ListUserPools response.
        - ``(False, "", "")`` if no pool matches.
    """
    cognito_client = boto3.client("cognito-idp")

    # MaxResults up to 60 per page; further pages are requested with NextToken.
    request_kwargs = {"MaxResults": 60}

    while True:
        page = cognito_client.list_user_pools(**request_kwargs)

        for pool in page.get("UserPools", []):
            if pool.get("Name") == user_pool_name:
                existing_user_pool_id = pool["Id"]
                print(
                    f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
                )
                return True, existing_user_pool_id, pool

        next_token = page.get("NextToken")
        if not next_token:
            break
        request_kwargs["NextToken"] = next_token

    return False, "", ""
|
| 977 |
+
|
| 978 |
+
|
| 979 |
+
def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
    """
    Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.

    Results are read page by page. The first request carries no ``NextToken``;
    later requests pass the token returned by the previous page.

    Args:
        user_pool_id: The ID of the Cognito User Pool.
        user_pool_client_name: The name of the User Pool Client to check for.

    Returns:
        A tuple:
        - True, client_id, client_details if the client exists.
        - False, "", {} otherwise, including when the listing call fails.
    """
    cognito_client = boto3.client("cognito-idp")

    # NextToken is only added once the service has returned one.
    request_kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}

    while True:
        try:
            response = cognito_client.list_user_pool_clients(**request_kwargs)
        except cognito_client.exceptions.ResourceNotFoundException:
            print(f"Error: User pool with ID '{user_pool_id}' not found.")
            return False, "", {}
        except cognito_client.exceptions.InvalidParameterException:
            print(f"Error: No app clients for '{user_pool_id}' found.")
            return False, "", {}
        except Exception as e:
            print("Could not check User Pool clients due to:", e)
            # Without a response there is nothing to inspect, so stop here
            # rather than fall through to an unbound or stale `response`.
            return False, "", {}

        for client in response.get("UserPoolClients", []):
            if client.get("ClientName") == user_pool_client_name:
                print(
                    f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
                )
                return True, client["ClientId"], client

        next_token = response.get("NextToken")
        if not next_token:
            break
        request_kwargs["NextToken"] = next_token

    return False, "", {}
|
| 1023 |
+
|
| 1024 |
+
|
| 1025 |
+
def check_for_secret(secret_name: str, secret_value: dict = ""):
    """
    Check whether a Secrets Manager secret with the given name can be read.

    This function only reads; it never creates or modifies a secret.

    Args:
        secret_name: The name of the Secrets Manager secret.
        secret_value: Not used by the lookup; the argument is accepted but
            never read.

    Returns:
        A tuple:
        - ``(True, response)`` when the secret exists, where ``response`` is the
          full result of ``get_secret_value``.
        - ``(False, {})`` when the secret is not found or any other error occurs
          (the error is printed).
    """
    secretsmanager_client = boto3.client("secretsmanager")

    try:
        # A missing secret surfaces as ResourceNotFoundException.
        stored_secret = secretsmanager_client.get_secret_value(SecretId=secret_name)
    except secretsmanager_client.exceptions.ResourceNotFoundException:
        print("Secret not found")
        return False, {}
    except Exception as e:
        # Any other failure of the get operation is reported and treated as "not found".
        print(f"Error checking for secret: {e}")
        return False, {}

    print("Secret already exists.")
    return True, stored_secret
|
| 1051 |
+
|
| 1052 |
+
|
| 1053 |
+
def check_alb_exists(
    load_balancer_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Check whether a load balancer with the given name exists.

    Args:
        load_balancer_name: The name passed to ``describe_load_balancers``.
        region_name: The AWS region to check in. If falsy, the client is
            created without an explicit region.

    Returns:
        A tuple:
        - ``(True, load_balancer)`` where ``load_balancer`` is the first element
          of the response's ``LoadBalancers`` list.
        - ``(False, {})`` when the list is empty, when the service reports
          ``LoadBalancerNotFound``, or when a non-ClientError exception occurs
          (which is printed).

    Raises:
        ClientError: For any service error other than ``LoadBalancerNotFound``.
    """
    client_options = {"region_name": region_name} if region_name else {}
    elbv2_client = boto3.client("elbv2", **client_options)

    try:
        response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
        matches = response["LoadBalancers"]
        if not matches:
            return False, {}
        return True, matches[0]
    except ClientError as e:
        # Only the "not found" code means absence; everything else propagates.
        if e.response["Error"]["Code"] != "LoadBalancerNotFound":
            raise
        return False, {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
|
| 1094 |
+
|
| 1095 |
+
|
| 1096 |
+
def check_fargate_task_definition_exists(
    task_definition_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a task definition with the given name exists.

    Args:
        task_definition_name: The name or ARN of the task definition to check.
        region_name: The AWS region to check in. If None, uses the default
            session region.

    Returns:
        A tuple:
        - ``(True, task_definition)`` where ``task_definition`` is the
          ``taskDefinition`` entry of the describe_task_definition response.
        - ``(False, {})`` when the service reports a ``ClientException`` whose
          message contains both "Task definition" and "does not exist", or when
          a non-ClientError exception occurs (which is printed).

    Raises:
        ClientError: For any other service error.
    """
    if region_name:
        ecs_client = boto3.client("ecs", region_name=region_name)
    else:
        ecs_client = boto3.client("ecs")
    try:
        response = ecs_client.describe_task_definition(
            taskDefinition=task_definition_name
        )
        # A successful call means the task definition exists.
        return True, response["taskDefinition"]
    except ClientError as e:
        error = e.response.get("Error", {})
        # The message lives under the "Error" entry of the parsed response. A
        # top-level "Message" is still consulted as a fallback; indexing it
        # directly raised KeyError from inside this handler.
        message = error.get("Message") or e.response.get("Message") or ""
        if (
            error.get("Code") == "ClientException"
            and "Task definition" in message
            and "does not exist" in message
        ):
            return False, {}
        # Re-raise other exceptions.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
|
| 1139 |
+
|
| 1140 |
+
|
| 1141 |
+
def check_ecs_service_exists(
    cluster_name: str, service_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Check whether an ECS service with the given name is returned for a cluster.

    Args:
        cluster_name: The name or ARN of the ECS cluster.
        service_name: The name of the ECS service to check.
        region_name: The AWS region to check in. If falsy, the client is
            created without an explicit region.

    Returns:
        A tuple:
        - ``(True, service)`` where ``service`` is the first element of the
          response's ``services`` list.
        - ``(False, {})`` when that list is empty, when the service reports
          ``ClusterNotFoundException`` or ``ServiceNotFoundException``, or when
          a non-ClientError exception occurs (which is printed).

    Raises:
        ClientError: For any other service error.
    """
    client_options = {"region_name": region_name} if region_name else {}
    ecs_client = boto3.client("ecs", **client_options)

    try:
        response = ecs_client.describe_services(
            cluster=cluster_name, services=[service_name]
        )
        found_services = response["services"]
        if not found_services:
            return False, {}
        return True, found_services[0]
    except ClientError as e:
        # Both "not found" codes mean absence; anything else propagates.
        not_found_codes = ("ClusterNotFoundException", "ServiceNotFoundException")
        if e.response["Error"]["Code"] in not_found_codes:
            return False, {}
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
|
| 1186 |
+
|
| 1187 |
+
|
| 1188 |
+
def check_cloudfront_distribution_exists(
    distribution_name: str, region_name: str = None
) -> tuple[bool, dict | None]:
    """
    Checks if a CloudFront distribution with the given alias exists.

    ListDistributions cannot filter by name, so every listed distribution is
    inspected and matched on its alternate domain names (the ``Aliases`` entry
    of each item). All pages of the listing are read.

    Args:
        distribution_name: The alias (alternate domain name) to look for.
        region_name: The AWS region used for the client. If None, uses the
            default session region.

    Returns:
        A tuple:
        - ``(True, distribution)`` for the first listed distribution having
          ``distribution_name`` among its aliases.
        - ``(False, None)`` when none matches, when the service reports
          ``NoSuchDistribution``, or when a non-ClientError exception occurs
          (which is printed).

    Raises:
        ClientError: For any other service error.
    """
    if region_name:
        cf_client = boto3.client("cloudfront", region_name=region_name)
    else:
        cf_client = boto3.client("cloudfront")
    try:
        request_kwargs = {}
        while True:
            response = cf_client.list_distributions(**request_kwargs)
            distribution_list = response.get("DistributionList") or {}

            # "Items" is absent when there are no distributions on this page.
            for distribution in distribution_list.get("Items") or []:
                # Aliases are reported under "Aliases"; "Items" may be missing
                # when a distribution has none.
                aliases = (distribution.get("Aliases") or {}).get("Items") or []
                if distribution_name in aliases:
                    return True, distribution

            next_marker = distribution_list.get("NextMarker")
            if not distribution_list.get("IsTruncated") or not next_marker:
                return False, None
            request_kwargs["Marker"] = next_marker
    except ClientError as e:
        # If the error indicates the Distribution doesn't exist, return False
        if e.response["Error"]["Code"] == "NoSuchDistribution":
            return False, None
        # Re-raise other exceptions
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, None
|
| 1235 |
+
|
| 1236 |
+
|
| 1237 |
+
def create_web_acl_with_common_rules(
    scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
):
    """
    Define a WAFv2 web ACL made of three AWS managed rule groups plus a rate limit.

    The managed groups receive priorities 1..3 in the order listed below and the
    rate-limit rule takes the next priority. The ACL's default action is allow.
    An output named "WebACLArn" exposing the ACL's ARN is also created.

    Args:
        scope: Construct that will own the web ACL and the output (typically
            ``self`` from the stack).
        web_acl_name: Name given to the web ACL.
        waf_scope: Value passed as the web ACL's ``scope`` property.

    Returns:
        The created ``wafv2.CfnWebACL``.
    """

    def _visibility(metric_name: str):
        # Every rule, and the ACL itself, uses the same visibility settings
        # apart from the metric name.
        return wafv2.CfnWebACL.VisibilityConfigProperty(
            cloud_watch_metrics_enabled=True,
            metric_name=metric_name,
            sampled_requests_enabled=True,
        )

    managed_group_names = (
        "AWSManagedRulesCommonRuleSet",
        "AWSManagedRulesKnownBadInputsRuleSet",
        "AWSManagedRulesAmazonIpReputationList",
    )

    rules = []
    for priority, group_name in enumerate(managed_group_names, start=1):
        # Only the common rule set carries a per-rule override: its
        # SizeRestrictions_BODY rule is switched to allow.
        overrides = None
        if group_name == "AWSManagedRulesCommonRuleSet":
            overrides = [
                wafv2.CfnWebACL.RuleActionOverrideProperty(
                    name="SizeRestrictions_BODY",
                    action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
                )
            ]

        group_statement = wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
            vendor_name="AWS",
            name=group_name,
            rule_action_overrides=overrides,
        )
        rules.append(
            wafv2.CfnWebACL.RuleProperty(
                name=group_name,
                priority=priority,
                statement=wafv2.CfnWebACL.StatementProperty(
                    managed_rule_group_statement=group_statement
                ),
                visibility_config=_visibility(group_name),
                # Managed rule groups take an override_action; 'none' keeps
                # the group's own actions.
                override_action=wafv2.CfnWebACL.OverrideActionProperty(none={}),
            )
        )

    # The rate-limit rule follows the managed groups in priority order.
    rules.append(
        wafv2.CfnWebACL.RuleProperty(
            name="RateLimitRule",
            priority=len(managed_group_names) + 1,
            statement=wafv2.CfnWebACL.StatementProperty(
                rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
                    limit=1000, aggregate_key_type="IP"
                )
            ),
            visibility_config=_visibility("RateLimitRule"),
            action=wafv2.CfnWebACL.RuleActionProperty(block={}),
        )
    )

    web_acl = wafv2.CfnWebACL(
        scope,
        "WebACL",
        name=web_acl_name,
        default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
        scope=waf_scope,
        visibility_config=_visibility("webACL"),
        rules=rules,
    )

    CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)

    return web_acl
|
| 1335 |
+
|
| 1336 |
+
|
| 1337 |
+
def check_web_acl_exists(
    web_acl_name: str, scope: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Web ACL with the given name and scope exists.

    The ACL summaries are listed page by page; on a name match the full ACL is
    fetched with ``get_web_acl``, which needs the ``Id`` from the summary.

    Args:
        web_acl_name: The name of the Web ACL to check.
        scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
        region_name: The AWS region to check in. Required for REGIONAL scope.
            Ignored for CLOUDFRONT, where 'us-east-1' is always used.

    Returns:
        A tuple:
        - ``(True, web_acl)`` where ``web_acl`` is the ``WebACL`` entry of the
          get_web_acl response.
        - ``(False, {})`` when no ACL has that name, when the service reports a
          not-found error, or when a non-ClientError exception occurs (which is
          printed).

    Raises:
        ValueError: If ``scope`` is not one of the two allowed values, or if
            ``scope`` is 'REGIONAL' and ``region_name`` is empty.
        ClientError: For any service error other than a not-found error.
    """
    if scope not in ["CLOUDFRONT", "REGIONAL"]:
        raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")

    if scope == "REGIONAL" and not region_name:
        raise ValueError("Region name is required for REGIONAL scope")

    if scope == "CLOUDFRONT":
        region_name = "us-east-1"  # CloudFront scope requires us-east-1

    # After the checks above region_name is always set.
    waf_client = boto3.client("wafv2", region_name=region_name)
    try:
        request_kwargs = {"Scope": scope}
        while True:
            response = waf_client.list_web_acls(**request_kwargs)
            summaries = response.get("WebACLs") or []

            for web_acl in summaries:
                if web_acl["Name"] == web_acl_name:
                    # Fetch the full Web ACL; the summary only has identifiers.
                    detail = waf_client.get_web_acl(
                        Name=web_acl_name, Scope=scope, Id=web_acl["Id"]
                    )
                    return True, detail["WebACL"]

            next_marker = response.get("NextMarker")
            # Stop on an empty page too, so a marker returned alongside no
            # results cannot keep the loop going.
            if not next_marker or not summaries:
                return False, {}
            request_kwargs["NextMarker"] = next_marker
    except ClientError as e:
        # Error codes treated as "the web ACL doesn't exist".
        not_found_codes = ("ResourceNotFoundException", "WAFNonexistentItemException")
        if e.response["Error"]["Code"] in not_found_codes:
            return False, {}
        # Re-raise other exceptions.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
|
| 1392 |
+
|
| 1393 |
+
|
| 1394 |
+
def add_alb_https_listener_with_cert(
    scope: Construct,
    logical_id: str,  # A unique ID for this listener construct
    alb: elb.ApplicationLoadBalancer,
    acm_certificate_arn: Optional[
        str
    ],  # Optional: If None, no HTTPS listener will be created
    default_target_group: elb.ITargetGroup,  # Mandatory: The target group to forward traffic to
    listener_port_https: int = 443,
    listener_open_to_internet: bool = False,  # Be cautious with True, ensure ALB security group restricts access
    # --- Cognito Authentication Parameters ---
    enable_cognito_auth: bool = False,
    cognito_user_pool: Optional[cognito.IUserPool] = None,
    cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
    cognito_user_pool_domain: Optional[
        str
    ] = None,  # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
    cognito_auth_scope: Optional[
        str
    ] = "openid profile email",  # Default recommended scope
    cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
    stickiness_cookie_duration=None,
    # --- End Cognito Parameters ---
) -> Optional[elb.ApplicationListener]:
    """
    Add an HTTPS listener to an ALB when a certificate ARN is supplied,
    optionally placing Cognito User Pool authentication in front of the target group.

    Args:
        scope (Construct): The scope in which this is defined (e.g., your CDK Stack).
            It is not referenced in the body; the listener is added via ``alb``.
        logical_id (str): Logical ID passed to ``alb.add_listener``.
        alb (elb.ApplicationLoadBalancer): The load balancer that receives the listener.
        acm_certificate_arn (Optional[str]): ARN of the ACM certificate to attach.
            If falsy, no listener is created.
        default_target_group (elb.ITargetGroup): Target group traffic is forwarded to.
        listener_port_https (int): Port the listener uses (default: 443).
        listener_open_to_internet (bool): Passed as the listener's ``open`` setting.
        enable_cognito_auth (bool): Must be exactly ``True`` to enable Cognito authentication.
        cognito_user_pool (Optional[cognito.IUserPool]): Required if enable_cognito_auth is True.
        cognito_user_pool_client (Optional[cognito.IUserPoolClient]): Required if enable_cognito_auth is True.
        cognito_user_pool_domain (Optional[str]): Required if enable_cognito_auth is True.
        cognito_auth_scope (Optional[str]): Scope passed to the Cognito action.
        cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for
            unauthenticated requests. Defaults to AUTHENTICATE.
        stickiness_cookie_duration: Passed to the Cognito action as ``session_timeout``.

    Returns:
        Optional[elb.ApplicationListener]: The created listener, or None when no
        certificate ARN was provided.

    Raises:
        ValueError: If Cognito authentication is enabled but the user pool, its
            client or its domain is missing.
    """
    # Without a certificate there is nothing to set up.
    if not acm_certificate_arn:
        print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
        return None

    certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
    print(
        f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
    )

    if enable_cognito_auth is True:
        cognito_requirements = (
            cognito_user_pool,
            cognito_user_pool_client,
            cognito_user_pool_domain,
        )
        if not all(cognito_requirements):
            raise ValueError(
                "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
            )
        print(
            f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
        )

        # Authenticate first; once that succeeds, forward to the target group.
        listener_action = elb_act.AuthenticateCognitoAction(
            next=elb.ListenerAction.forward([default_target_group]),
            user_pool=cognito_user_pool,
            user_pool_client=cognito_user_pool_client,
            user_pool_domain=cognito_user_pool_domain,
            scope=cognito_auth_scope,
            on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
            session_timeout=stickiness_cookie_duration,
        )
    else:
        listener_action = elb.ListenerAction.forward([default_target_group])
        print("Cognito authentication is NOT enabled for this listener.")

    created_listener = alb.add_listener(
        logical_id,
        port=listener_port_https,
        open=listener_open_to_internet,
        certificates=certificates_list,
        default_action=listener_action,
    )
    print(f"ALB HTTPS listener on port {listener_port_https} defined.")

    return created_listener
|
| 1496 |
+
|
| 1497 |
+
|
| 1498 |
+
def ensure_folder_exists(output_folder: str):
    """Make sure ``output_folder`` exists, creating it (and any parents) if absent.

    A one-line status message is printed either way, saying whether the folder
    was created by this call or was already present.

    Args:
        output_folder: Path of the folder to check or create.
    """
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # exist_ok=True keeps this safe if the folder appears between the check
    # above and the call below.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
|
| 1507 |
+
|
| 1508 |
+
|
| 1509 |
+
def create_basic_config_env(
    out_dir: str = "config",
    S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
    S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
    ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
    FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
    USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
):
    """
    Write a starter ``config.env`` for a newly deployed redaction app.

    The file is placed at ``<out_dir>/config.env`` (the folder is created when
    missing). Each setting is written with ``set_key`` using
    ``quote_mode="never"``, so existing keys in the file are updated in place.

    Args:
        out_dir: Folder that will contain ``config.env``.
        S3_LOG_CONFIG_BUCKET_NAME: Value written as ``DOCUMENT_REDACTION_BUCKET``.
        S3_OUTPUT_BUCKET_NAME: Value written as
            ``TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET``.
        ACCESS_LOG_DYNAMODB_TABLE_NAME: Value written under the same key name.
        FEEDBACK_LOG_DYNAMODB_TABLE_NAME: Value written under the same key name.
        USAGE_LOG_DYNAMODB_TABLE_NAME: Value written under the same key name.

    Returns:
        The dict of key/value pairs that were written.
    """
    # Fixed switches that every generated config starts with.
    feature_flags = {
        "COGNITO_AUTH": "True",
        "RUN_AWS_FUNCTIONS": "True",
        "DISPLAY_FILE_NAMES_IN_LOGS": "False",
        "SESSION_OUTPUT_FOLDER": "True",
        "SAVE_LOGS_TO_DYNAMODB": "True",
        "SHOW_COSTS": "True",
        "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
        "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
    }
    # Deployment-specific resource names supplied by the caller.
    resource_names = {
        "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
        "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
        "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
        "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
        "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
    }
    # Merge order matters: it is the order in which keys are written.
    variables = {**feature_flags, **resource_names}

    ensure_folder_exists(out_dir + "/")
    env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))

    # Create an empty file up front so every set_key call below operates on
    # an existing file.
    if not os.path.exists(env_file_path):
        open(env_file_path, "w").close()

    for name in variables:
        set_key(env_file_path, name, str(variables[name]), quote_mode="never")

    return variables
|
| 1550 |
+
|
| 1551 |
+
|
| 1552 |
+
def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
    """
    Start a build of an existing CodeBuild project and print its details.

    Failures are reported on stdout rather than raised: a missing project
    prints a "not found" message and any other exception prints a generic
    error line. The function returns ``None`` in every case.

    Args:
        PROJECT_NAME: Name of the CodeBuild project to build.
        AWS_REGION: Region used for the CodeBuild client and in the printed
            console URL.
    """
    codebuild_client = boto3.client("codebuild", region_name=AWS_REGION)

    try:
        print(f"Attempting to start build for project: {PROJECT_NAME}")

        response = codebuild_client.start_build(projectName=PROJECT_NAME)
        build_id = response["build"]["id"]

        print(f"Successfully started build with ID: {build_id}")
        print(f"Build ARN: {response['build']['arn']}")
        # The URL is assembled by hand from the region, project name and the
        # part of the build ID after the last colon; it is only approximate.
        print("Build URL (approximate - construct based on region and ID):")
        print(
            f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail"
        )
    except codebuild_client.exceptions.ResourceNotFoundException:
        print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
|
| 1582 |
+
|
| 1583 |
+
|
| 1584 |
+
def upload_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str,
    RUN_AWS_FUNCTIONS: str = "1",
):
    """
    Upload one or more local files to an Amazon S3 bucket.

    Each file is stored under ``s3_key + <file name>``; no separator is
    inserted, so ``s3_key`` should already end with "/" when it is a folder
    prefix. A failure on one file is recorded and the remaining files are
    still attempted.

    Args:
        local_file_paths: Local path(s) to upload. A single string is treated
            as a one-item list.
        s3_key: Key prefix prepended to each file name.
        s3_bucket: Name of the destination bucket.
        RUN_AWS_FUNCTIONS: Uploads only run when this equals the string "1".

    Returns:
        A status string: the per-file messages joined by newlines, or a single
        message explaining why nothing was uploaded.
    """
    if RUN_AWS_FUNCTIONS != "1":
        return "App not set to run AWS functions"

    summary = ""
    try:
        if not (s3_bucket and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        if isinstance(local_file_paths, str):
            local_file_paths = [local_file_paths]

        per_file_messages = []
        for path in local_file_paths:
            if not s3_client:
                summary = "Could not connect to AWS."
                continue

            try:
                # The object key is the prefix plus the bare file name.
                base_name = os.path.basename(path)
                full_key = s3_key + base_name
                print("S3 key: ", full_key)

                s3_client.upload_file(path, s3_bucket, full_key)
                status = "File " + base_name + " uploaded successfully!"
                print(status)
            except Exception as e:
                status = f"Error uploading file(s): {e}"
                print(status)

            per_file_messages.append(status)
            summary = "\n".join(per_file_messages)
    except Exception as e:
        # Anything outside the per-file handling (e.g. client creation).
        summary = "Could not upload files to S3 due to: " + str(e)
        print(summary)

    return summary
|
| 1649 |
+
|
| 1650 |
+
|
| 1651 |
+
# Initialize ECS client
|
| 1652 |
+
def start_ecs_task(cluster_name, service_name):
    """Set an ECS service's desired task count to 1.

    Args:
        cluster_name: ECS cluster that contains the service.
        service_name: Service whose desired count is updated.

    Returns:
        A dict with ``statusCode`` and ``body`` keys: 200 and a confirmation
        message when the update call succeeds, 500 and the error text when it
        raises.
    """
    ecs = boto3.client("ecs")

    try:
        ecs.update_service(
            cluster=cluster_name, service=service_name, desiredCount=1
        )
        outcome = {
            "statusCode": 200,
            "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task.",
        }
    except Exception as e:
        outcome = {"statusCode": 500, "body": f"Error updating service: {str(e)}"}

    return outcome
|
cdk/cdk_stack.py
ADDED
|
@@ -0,0 +1,1991 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json # You might still need json if loading task_definition.json
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any, Dict, List
|
| 4 |
+
|
| 5 |
+
from aws_cdk import (
|
| 6 |
+
CfnOutput, # <-- Import CfnOutput directly
|
| 7 |
+
Duration,
|
| 8 |
+
RemovalPolicy,
|
| 9 |
+
SecretValue,
|
| 10 |
+
Stack,
|
| 11 |
+
)
|
| 12 |
+
from aws_cdk import aws_cloudfront as cloudfront
|
| 13 |
+
from aws_cdk import aws_cloudfront_origins as origins
|
| 14 |
+
from aws_cdk import aws_codebuild as codebuild
|
| 15 |
+
from aws_cdk import aws_cognito as cognito
|
| 16 |
+
from aws_cdk import aws_dynamodb as dynamodb # Import the DynamoDB module
|
| 17 |
+
from aws_cdk import aws_ec2 as ec2
|
| 18 |
+
from aws_cdk import aws_ecr as ecr
|
| 19 |
+
from aws_cdk import aws_ecs as ecs
|
| 20 |
+
from aws_cdk import aws_elasticloadbalancingv2 as elbv2
|
| 21 |
+
from aws_cdk import aws_iam as iam
|
| 22 |
+
from aws_cdk import aws_kms as kms
|
| 23 |
+
from aws_cdk import aws_logs as logs
|
| 24 |
+
from aws_cdk import aws_s3 as s3
|
| 25 |
+
from aws_cdk import aws_secretsmanager as secretsmanager
|
| 26 |
+
from aws_cdk import aws_wafv2 as wafv2
|
| 27 |
+
from cdk_config import (
|
| 28 |
+
ACCESS_LOG_DYNAMODB_TABLE_NAME,
|
| 29 |
+
ACM_SSL_CERTIFICATE_ARN,
|
| 30 |
+
ALB_NAME,
|
| 31 |
+
ALB_NAME_SECURITY_GROUP_NAME,
|
| 32 |
+
ALB_TARGET_GROUP_NAME,
|
| 33 |
+
AWS_ACCOUNT_ID,
|
| 34 |
+
AWS_MANAGED_TASK_ROLES_LIST,
|
| 35 |
+
AWS_REGION,
|
| 36 |
+
CDK_PREFIX,
|
| 37 |
+
CLOUDFRONT_DISTRIBUTION_NAME,
|
| 38 |
+
CLOUDFRONT_GEO_RESTRICTION,
|
| 39 |
+
CLUSTER_NAME,
|
| 40 |
+
CODEBUILD_PROJECT_NAME,
|
| 41 |
+
CODEBUILD_ROLE_NAME,
|
| 42 |
+
COGNITO_ACCESS_TOKEN_VALIDITY,
|
| 43 |
+
COGNITO_ID_TOKEN_VALIDITY,
|
| 44 |
+
COGNITO_REDIRECTION_URL,
|
| 45 |
+
COGNITO_REFRESH_TOKEN_VALIDITY,
|
| 46 |
+
COGNITO_USER_POOL_CLIENT_NAME,
|
| 47 |
+
COGNITO_USER_POOL_CLIENT_SECRET_NAME,
|
| 48 |
+
COGNITO_USER_POOL_DOMAIN_PREFIX,
|
| 49 |
+
COGNITO_USER_POOL_NAME,
|
| 50 |
+
CUSTOM_HEADER,
|
| 51 |
+
CUSTOM_HEADER_VALUE,
|
| 52 |
+
CUSTOM_KMS_KEY_NAME,
|
| 53 |
+
DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
|
| 54 |
+
ECR_CDK_REPO_NAME,
|
| 55 |
+
ECS_LOG_GROUP_NAME,
|
| 56 |
+
ECS_READ_ONLY_FILE_SYSTEM,
|
| 57 |
+
ECS_SECURITY_GROUP_NAME,
|
| 58 |
+
ECS_SERVICE_NAME,
|
| 59 |
+
ECS_TASK_CPU_SIZE,
|
| 60 |
+
ECS_TASK_EXECUTION_ROLE_NAME,
|
| 61 |
+
ECS_TASK_MEMORY_SIZE,
|
| 62 |
+
ECS_TASK_ROLE_NAME,
|
| 63 |
+
ECS_USE_FARGATE_SPOT,
|
| 64 |
+
EXISTING_IGW_ID,
|
| 65 |
+
EXISTING_LOAD_BALANCER_ARN,
|
| 66 |
+
EXISTING_LOAD_BALANCER_DNS,
|
| 67 |
+
FARGATE_TASK_DEFINITION_NAME,
|
| 68 |
+
FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
|
| 69 |
+
GITHUB_REPO_BRANCH,
|
| 70 |
+
GITHUB_REPO_NAME,
|
| 71 |
+
GITHUB_REPO_USERNAME,
|
| 72 |
+
GRADIO_SERVER_PORT,
|
| 73 |
+
LOAD_BALANCER_WEB_ACL_NAME,
|
| 74 |
+
NAT_GATEWAY_NAME,
|
| 75 |
+
NEW_VPC_CIDR,
|
| 76 |
+
NEW_VPC_DEFAULT_NAME,
|
| 77 |
+
PRIVATE_SUBNET_AVAILABILITY_ZONES,
|
| 78 |
+
PRIVATE_SUBNET_CIDR_BLOCKS,
|
| 79 |
+
PRIVATE_SUBNETS_TO_USE,
|
| 80 |
+
PUBLIC_SUBNET_AVAILABILITY_ZONES,
|
| 81 |
+
PUBLIC_SUBNET_CIDR_BLOCKS,
|
| 82 |
+
PUBLIC_SUBNETS_TO_USE,
|
| 83 |
+
S3_LOG_CONFIG_BUCKET_NAME,
|
| 84 |
+
S3_OUTPUT_BUCKET_NAME,
|
| 85 |
+
SAVE_LOGS_TO_DYNAMODB,
|
| 86 |
+
SINGLE_NAT_GATEWAY_ID,
|
| 87 |
+
TASK_DEFINITION_FILE_LOCATION,
|
| 88 |
+
USAGE_LOG_DYNAMODB_TABLE_NAME,
|
| 89 |
+
USE_CLOUDFRONT,
|
| 90 |
+
USE_CUSTOM_KMS_KEY,
|
| 91 |
+
VPC_NAME,
|
| 92 |
+
WEB_ACL_NAME,
|
| 93 |
+
)
|
| 94 |
+
from cdk_functions import ( # Only keep CDK-native functions
|
| 95 |
+
add_alb_https_listener_with_cert,
|
| 96 |
+
add_custom_policies,
|
| 97 |
+
create_nat_gateway,
|
| 98 |
+
create_subnets,
|
| 99 |
+
create_web_acl_with_common_rules,
|
| 100 |
+
)
|
| 101 |
+
from constructs import Construct
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _get_env_list(env_var_name: str) -> List[str]:
    """Parse a list-like string such as ``"['a', 'b']"`` into a list of strings.

    Despite the parameter name, ``env_var_name`` is the raw *value* to parse,
    not the name of an environment variable; nothing is looked up here.

    Args:
        env_var_name: Comma-separated items, optionally wrapped in one pair of
            square brackets and optionally quoted.

    Returns:
        The non-empty items, with surrounding whitespace and all single and
        double quote characters removed. ``[]`` when there are no items.
    """
    value = env_var_name.strip()
    # Remove the enclosing brackets only when they are actually present.
    # Slicing [1:-1] unconditionally would eat real characters from bare
    # input such as "a,b" or from values padded with whitespace.
    if value.startswith("[") and value.endswith("]"):
        value = value[1:-1]
    value = value.strip().replace('"', "").replace("'", "")
    if not value:
        return []
    # Split by comma and drop empty entries left by stray or trailing commas.
    return [s.strip() for s in value.split(",") if s.strip()]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Convert the list-like string settings imported from cdk_config into real
# Python lists. Empty/unset settings are left untouched so later truthiness
# checks still treat them as "not configured".
# NOTE(review): values are presumably bracketed strings such as
# "['subnet-a', 'subnet-b']" - confirm against cdk_config.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

# Pass the configured VALUE, not the setting's name: _get_env_list parses the
# string it is given and performs no environment lookup, so handing it a
# literal like "PUBLIC_SUBNET_CIDR_BLOCKS" would yield a bogus one-item list.
if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )

if AWS_MANAGED_TASK_ROLES_LIST:
    AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class CdkStack(Stack):
|
| 135 |
+
|
| 136 |
+
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
|
| 137 |
+
super().__init__(scope, construct_id, **kwargs)
|
| 138 |
+
|
| 139 |
+
# --- Helper to get context values ---
|
| 140 |
+
def get_context_bool(key: str, default: bool = False) -> bool:
|
| 141 |
+
value = self.node.try_get_context(key)
|
| 142 |
+
if value is None:
|
| 143 |
+
return default
|
| 144 |
+
if isinstance(value, bool):
|
| 145 |
+
return value
|
| 146 |
+
if isinstance(value, str):
|
| 147 |
+
return value.lower() in ("true", "1", "yes")
|
| 148 |
+
return bool(value)
|
| 149 |
+
|
| 150 |
+
def get_context_str(key: str, default: str = None) -> str:
|
| 151 |
+
return self.node.try_get_context(key) or default
|
| 152 |
+
|
| 153 |
+
def get_context_dict(key: str, default: dict = None) -> dict:
|
| 154 |
+
return self.node.try_get_context(key) or default
|
| 155 |
+
|
| 156 |
+
def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
    """Return the context value for ``key`` when it is a list, else ``[]``.

    The lookup goes through the enclosing stack's
    ``self.node.try_get_context``. A warning is printed when the value is
    missing or is not a list. The list's items are returned as-is; they are
    not validated to be dicts.
    """
    entries = self.node.try_get_context(key)
    if isinstance(entries, list):
        # Optional: validation that every item is a dict could be added here.
        return entries
    print(
        f"Warning: Context key '{key}' not found or not a list. Returning empty list."
    )
    return []
|
| 165 |
+
|
| 166 |
+
self.template_options.description = "Deployment of the 'doc_redaction' PDF, image, and XLSX/CSV redaction app. Git repo available at: https://github.com/seanpedrick-case/doc_redaction."
|
| 167 |
+
|
| 168 |
+
# --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
|
| 169 |
+
new_vpc_created = False
|
| 170 |
+
if VPC_NAME:
|
| 171 |
+
vpc_id = get_context_str("vpc_id")
|
| 172 |
+
if not vpc_id:
|
| 173 |
+
raise ValueError(
|
| 174 |
+
f"VPC '{VPC_NAME}' was not resolved during pre-check (missing "
|
| 175 |
+
"'vpc_id' in context). Re-run from the cdk/ directory so "
|
| 176 |
+
"precheck.context.json is generated."
|
| 177 |
+
)
|
| 178 |
+
availability_zones = list(
|
| 179 |
+
dict.fromkeys(
|
| 180 |
+
(PUBLIC_SUBNET_AVAILABILITY_ZONES or [])
|
| 181 |
+
+ (PRIVATE_SUBNET_AVAILABILITY_ZONES or [])
|
| 182 |
+
)
|
| 183 |
+
)
|
| 184 |
+
if not availability_zones:
|
| 185 |
+
raise ValueError(
|
| 186 |
+
"vpc_id is in context but no subnet availability zones are "
|
| 187 |
+
"configured. Set PUBLIC_SUBNET_AVAILABILITY_ZONES and/or "
|
| 188 |
+
"PRIVATE_SUBNET_AVAILABILITY_ZONES in cdk_config.env."
|
| 189 |
+
)
|
| 190 |
+
vpc = ec2.Vpc.from_vpc_attributes(
|
| 191 |
+
self,
|
| 192 |
+
"VPC",
|
| 193 |
+
vpc_id=vpc_id,
|
| 194 |
+
availability_zones=availability_zones,
|
| 195 |
+
)
|
| 196 |
+
print(f"Using VPC from pre-check context: {vpc_id}")
|
| 197 |
+
|
| 198 |
+
elif NEW_VPC_DEFAULT_NAME:
|
| 199 |
+
new_vpc_created = True
|
| 200 |
+
print(
|
| 201 |
+
f"NEW_VPC_DEFAULT_NAME ('{NEW_VPC_DEFAULT_NAME}') is set. Creating a new VPC."
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Configuration for the new VPC
|
| 205 |
+
# You can make these configurable via context as well, e.g.,
|
| 206 |
+
# new_vpc_cidr = self.node.try_get_context("new_vpc_cidr") or "10.0.0.0/24"
|
| 207 |
+
# new_vpc_max_azs = self.node.try_get_context("new_vpc_max_azs") or 2 # Use 2 AZs by default for HA
|
| 208 |
+
# new_vpc_nat_gateways = self.node.try_get_context("new_vpc_nat_gateways") or new_vpc_max_azs # One NAT GW per AZ for HA
|
| 209 |
+
# or 1 for cost savings if acceptable
|
| 210 |
+
if not NEW_VPC_CIDR:
|
| 211 |
+
raise Exception(
|
| 212 |
+
"App has been instructed to create a new VPC but not VPC CDR range provided to variable NEW_VPC_CIDR"
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
print("Provided NEW_VPC_CIDR range:", NEW_VPC_CIDR)
|
| 216 |
+
|
| 217 |
+
new_vpc_cidr = NEW_VPC_CIDR
|
| 218 |
+
new_vpc_max_azs = 2 # Creates resources in 2 AZs. Adjust as needed.
|
| 219 |
+
|
| 220 |
+
# For "a NAT gateway", you can set nat_gateways=1.
|
| 221 |
+
# For resilience (NAT GW per AZ), set nat_gateways=new_vpc_max_azs.
|
| 222 |
+
# The Vpc construct will create NAT Gateway(s) if subnet_type PRIVATE_WITH_EGRESS is used
|
| 223 |
+
# and nat_gateways > 0.
|
| 224 |
+
new_vpc_nat_gateways = (
|
| 225 |
+
1 # Creates a single NAT Gateway for cost-effectiveness.
|
| 226 |
+
)
|
| 227 |
+
# If you need one per AZ for higher availability, set this to new_vpc_max_azs.
|
| 228 |
+
|
| 229 |
+
vpc = ec2.Vpc(
|
| 230 |
+
self,
|
| 231 |
+
"MyNewLogicalVpc", # This is the CDK construct ID
|
| 232 |
+
vpc_name=NEW_VPC_DEFAULT_NAME,
|
| 233 |
+
ip_addresses=ec2.IpAddresses.cidr(new_vpc_cidr),
|
| 234 |
+
max_azs=new_vpc_max_azs,
|
| 235 |
+
nat_gateways=new_vpc_nat_gateways, # Number of NAT gateways to create
|
| 236 |
+
subnet_configuration=[
|
| 237 |
+
ec2.SubnetConfiguration(
|
| 238 |
+
name="Public", # Name prefix for public subnets
|
| 239 |
+
subnet_type=ec2.SubnetType.PUBLIC,
|
| 240 |
+
cidr_mask=28, # Adjust CIDR mask as needed (e.g., /24 provides ~250 IPs per subnet)
|
| 241 |
+
),
|
| 242 |
+
ec2.SubnetConfiguration(
|
| 243 |
+
name="Private", # Name prefix for private subnets
|
| 244 |
+
subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, # Ensures these subnets have NAT Gateway access
|
| 245 |
+
cidr_mask=28, # Adjust CIDR mask as needed
|
| 246 |
+
),
|
| 247 |
+
# You could also add ec2.SubnetType.PRIVATE_ISOLATED if needed
|
| 248 |
+
],
|
| 249 |
+
# Internet Gateway is created and configured automatically for PUBLIC subnets.
|
| 250 |
+
# Route tables for public subnets will point to the IGW.
|
| 251 |
+
# Route tables for PRIVATE_WITH_EGRESS subnets will point to the NAT Gateway(s).
|
| 252 |
+
)
|
| 253 |
+
print(
|
| 254 |
+
f"Successfully created new VPC: {vpc.vpc_id} with name '{NEW_VPC_DEFAULT_NAME}'"
|
| 255 |
+
)
|
| 256 |
+
# If nat_gateways > 0, vpc.nat_gateway_ips will contain EIPs if Vpc created them.
|
| 257 |
+
# vpc.public_subnets, vpc.private_subnets, vpc.isolated_subnets are populated.
|
| 258 |
+
|
| 259 |
+
else:
|
| 260 |
+
raise Exception(
|
| 261 |
+
"VPC_NAME for current VPC not found, and NEW_VPC_DEFAULT_NAME not found to create a new VPC"
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
# --- Subnet Handling (Check Context and Create/Import) ---
|
| 265 |
+
# Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
|
| 266 |
+
# We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
|
| 267 |
+
self.public_subnets: List[ec2.ISubnet] = []
|
| 268 |
+
self.private_subnets: List[ec2.ISubnet] = []
|
| 269 |
+
# Store L1 CfnRouteTables explicitly if you need to reference them later
|
| 270 |
+
self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
|
| 271 |
+
self.public_route_tables_cfn: List[ec2.CfnRouteTable] = (
|
| 272 |
+
[]
|
| 273 |
+
) # New: to store public RTs
|
| 274 |
+
|
| 275 |
+
names_to_create_private = []
|
| 276 |
+
names_to_create_public = []
|
| 277 |
+
|
| 278 |
+
if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
|
| 279 |
+
print(
|
| 280 |
+
"Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets."
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
print("vpc.public_subnets:", vpc.public_subnets)
|
| 284 |
+
print("vpc.private_subnets:", vpc.private_subnets)
|
| 285 |
+
|
| 286 |
+
if (
|
| 287 |
+
vpc.public_subnets
|
| 288 |
+
): # These are already one_per_az if max_azs was used and Vpc created them
|
| 289 |
+
self.public_subnets.extend(vpc.public_subnets)
|
| 290 |
+
else:
|
| 291 |
+
self.node.add_warning("No public subnets found in the VPC.")
|
| 292 |
+
|
| 293 |
+
# Get private subnets with egress specifically
|
| 294 |
+
# selected_private_subnets_with_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
|
| 295 |
+
|
| 296 |
+
print(
|
| 297 |
+
f"Selected from VPC: {len(self.public_subnets)} public, {len(self.private_subnets)} private_with_egress subnets."
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
if (
|
| 301 |
+
len(self.public_subnets) < 1 or len(self.private_subnets) < 1
|
| 302 |
+
): # Simplified check for new VPC
|
| 303 |
+
# If new_vpc_max_azs was 1, you'd have 1 of each. If 2, then 2 of each.
|
| 304 |
+
# The original check ' < 2' might be too strict if new_vpc_max_azs=1
|
| 305 |
+
pass # For new VPC, allow single AZ setups if configured that way. The VPC construct ensures one per AZ up to max_azs.
|
| 306 |
+
|
| 307 |
+
if not self.public_subnets and not self.private_subnets:
|
| 308 |
+
print(
|
| 309 |
+
"Error: No public or private subnets could be found in the VPC for automatic selection. "
|
| 310 |
+
"You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
|
| 311 |
+
)
|
| 312 |
+
raise RuntimeError("No suitable subnets found for automatic selection.")
|
| 313 |
+
else:
|
| 314 |
+
print(
|
| 315 |
+
f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC properties."
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
selected_public_subnets = vpc.select_subnets(
|
| 319 |
+
subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True
|
| 320 |
+
)
|
| 321 |
+
private_subnets_egress = vpc.select_subnets(
|
| 322 |
+
subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
if private_subnets_egress.subnets:
|
| 326 |
+
self.private_subnets.extend(private_subnets_egress.subnets)
|
| 327 |
+
else:
|
| 328 |
+
self.node.add_warning(
|
| 329 |
+
"No PRIVATE_WITH_EGRESS subnets found in the VPC."
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
try:
|
| 333 |
+
private_subnets_isolated = vpc.select_subnets(
|
| 334 |
+
subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True
|
| 335 |
+
)
|
| 336 |
+
except Exception as e:
|
| 337 |
+
private_subnets_isolated = []
|
| 338 |
+
print("Could not find any isolated subnets due to:", e)
|
| 339 |
+
|
| 340 |
+
###
|
| 341 |
+
combined_subnet_objects = []
|
| 342 |
+
|
| 343 |
+
if private_subnets_isolated:
|
| 344 |
+
if private_subnets_egress.subnets:
|
| 345 |
+
# Add the first PRIVATE_WITH_EGRESS subnet
|
| 346 |
+
combined_subnet_objects.append(private_subnets_egress.subnets[0])
|
| 347 |
+
elif not private_subnets_isolated:
|
| 348 |
+
if private_subnets_egress.subnets:
|
| 349 |
+
# No isolated subnets were found: add all PRIVATE_WITH_EGRESS subnets
|
| 350 |
+
combined_subnet_objects.extend(private_subnets_egress.subnets)
|
| 351 |
+
else:
|
| 352 |
+
self.node.add_warning(
|
| 353 |
+
"No PRIVATE_WITH_EGRESS subnets found to select the first one."
|
| 354 |
+
)
|
| 355 |
+
|
| 356 |
+
# Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
|
| 357 |
+
try:
|
| 358 |
+
if len(private_subnets_isolated.subnets) > 1:
|
| 359 |
+
combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
|
| 360 |
+
elif (
|
| 361 |
+
private_subnets_isolated.subnets
|
| 362 |
+
): # Only 1 isolated subnet, add a warning if [1:] was desired
|
| 363 |
+
self.node.add_warning(
|
| 364 |
+
"Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty."
|
| 365 |
+
)
|
| 366 |
+
else:
|
| 367 |
+
self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
|
| 368 |
+
except Exception as e:
|
| 369 |
+
print("Could not identify private isolated subnets due to:", e)
|
| 370 |
+
|
| 371 |
+
# Create an ec2.SelectedSubnets object from the combined private subnet list.
|
| 372 |
+
selected_private_subnets = vpc.select_subnets(
|
| 373 |
+
subnets=combined_subnet_objects
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
print("selected_public_subnets:", selected_public_subnets)
|
| 377 |
+
print("selected_private_subnets:", selected_private_subnets)
|
| 378 |
+
|
| 379 |
+
if (
|
| 380 |
+
len(selected_public_subnets.subnet_ids) < 2
|
| 381 |
+
or len(selected_private_subnets.subnet_ids) < 2
|
| 382 |
+
):
|
| 383 |
+
raise Exception(
|
| 384 |
+
"Need at least two public or private subnets in different availability zones"
|
| 385 |
+
)
|
| 386 |
+
|
| 387 |
+
if not selected_public_subnets and not selected_private_subnets:
|
| 388 |
+
# If no subnets could be found even with automatic selection, raise an error.
|
| 389 |
+
# This ensures the stack doesn't proceed if it absolutely needs subnets.
|
| 390 |
+
print(
|
| 391 |
+
"Error: No existing public or private subnets could be found in the VPC for automatic selection. "
|
| 392 |
+
"You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
|
| 393 |
+
)
|
| 394 |
+
raise RuntimeError("No suitable subnets found for automatic selection.")
|
| 395 |
+
else:
|
| 396 |
+
self.public_subnets = selected_public_subnets.subnets
|
| 397 |
+
self.private_subnets = selected_private_subnets.subnets
|
| 398 |
+
print(
|
| 399 |
+
f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery."
|
| 400 |
+
)
|
| 401 |
+
|
| 402 |
+
print("self.public_subnets:", self.public_subnets)
|
| 403 |
+
print("self.private_subnets:", self.private_subnets)
|
| 404 |
+
# Since subnets are now assigned, we can exit this processing block.
|
| 405 |
+
# The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
|
| 406 |
+
|
| 407 |
+
checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
|
| 408 |
+
checked_private_subnets_ctx = get_context_dict("checked_private_subnets")
|
| 409 |
+
|
| 410 |
+
public_subnets_data_for_creation_ctx = get_context_list_of_dicts(
|
| 411 |
+
"public_subnets_to_create"
|
| 412 |
+
)
|
| 413 |
+
private_subnets_data_for_creation_ctx = get_context_list_of_dicts(
|
| 414 |
+
"private_subnets_to_create"
|
| 415 |
+
)
|
| 416 |
+
|
| 417 |
+
# --- 3. Process Public Subnets ---
|
| 418 |
+
print("\n--- Processing Public Subnets ---")
|
| 419 |
+
# Import existing public subnets
|
| 420 |
+
if checked_public_subnets_ctx:
|
| 421 |
+
for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
|
| 422 |
+
subnet_info = checked_public_subnets_ctx.get(subnet_name)
|
| 423 |
+
if subnet_info and subnet_info.get("exists"):
|
| 424 |
+
subnet_id = subnet_info.get("id")
|
| 425 |
+
if not subnet_id:
|
| 426 |
+
raise RuntimeError(
|
| 427 |
+
f"Context for existing public subnet '{subnet_name}' is missing 'id'."
|
| 428 |
+
)
|
| 429 |
+
subnet_az = subnet_info.get("az")
|
| 430 |
+
if (
|
| 431 |
+
not subnet_az
|
| 432 |
+
and PUBLIC_SUBNET_AVAILABILITY_ZONES
|
| 433 |
+
and i < len(PUBLIC_SUBNET_AVAILABILITY_ZONES)
|
| 434 |
+
):
|
| 435 |
+
subnet_az = PUBLIC_SUBNET_AVAILABILITY_ZONES[i]
|
| 436 |
+
if not subnet_az:
|
| 437 |
+
raise RuntimeError(
|
| 438 |
+
f"Context for existing public subnet '{subnet_name}' is missing 'az'."
|
| 439 |
+
)
|
| 440 |
+
subnet_attrs = {
|
| 441 |
+
"subnet_id": subnet_id,
|
| 442 |
+
"availability_zone": subnet_az,
|
| 443 |
+
}
|
| 444 |
+
route_table_id = subnet_info.get("route_table_id")
|
| 445 |
+
if route_table_id:
|
| 446 |
+
subnet_attrs["route_table_id"] = route_table_id
|
| 447 |
+
try:
|
| 448 |
+
imported_subnet = ec2.Subnet.from_subnet_attributes(
|
| 449 |
+
self,
|
| 450 |
+
f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}",
|
| 451 |
+
**subnet_attrs,
|
| 452 |
+
)
|
| 453 |
+
self.public_subnets.append(imported_subnet)
|
| 454 |
+
print(
|
| 455 |
+
f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})"
|
| 456 |
+
)
|
| 457 |
+
except Exception as e:
|
| 458 |
+
raise RuntimeError(
|
| 459 |
+
f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}"
|
| 460 |
+
)
|
| 461 |
+
|
| 462 |
+
# Create new public subnets based on public_subnets_data_for_creation_ctx
|
| 463 |
+
if public_subnets_data_for_creation_ctx:
|
| 464 |
+
names_to_create_public = [
|
| 465 |
+
s["name"] for s in public_subnets_data_for_creation_ctx
|
| 466 |
+
]
|
| 467 |
+
cidrs_to_create_public = [
|
| 468 |
+
s["cidr"] for s in public_subnets_data_for_creation_ctx
|
| 469 |
+
]
|
| 470 |
+
azs_to_create_public = [
|
| 471 |
+
s["az"] for s in public_subnets_data_for_creation_ctx
|
| 472 |
+
]
|
| 473 |
+
|
| 474 |
+
if names_to_create_public:
|
| 475 |
+
print(
|
| 476 |
+
f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}"
|
| 477 |
+
)
|
| 478 |
+
newly_created_public_subnets, newly_created_public_rts_cfn = (
|
| 479 |
+
create_subnets(
|
| 480 |
+
self,
|
| 481 |
+
vpc,
|
| 482 |
+
CDK_PREFIX,
|
| 483 |
+
names_to_create_public,
|
| 484 |
+
cidrs_to_create_public,
|
| 485 |
+
azs_to_create_public,
|
| 486 |
+
is_public=True,
|
| 487 |
+
internet_gateway_id=EXISTING_IGW_ID,
|
| 488 |
+
)
|
| 489 |
+
)
|
| 490 |
+
self.public_subnets.extend(newly_created_public_subnets)
|
| 491 |
+
self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
|
| 492 |
+
|
| 493 |
+
if (
|
| 494 |
+
not self.public_subnets
|
| 495 |
+
and not names_to_create_public
|
| 496 |
+
and not PUBLIC_SUBNETS_TO_USE
|
| 497 |
+
):
|
| 498 |
+
raise Exception("No public subnets found or created, exiting.")
|
| 499 |
+
|
| 500 |
+
# --- NAT Gateway Creation/Lookup ---
|
| 501 |
+
print("Creating NAT gateway/located existing")
|
| 502 |
+
self.single_nat_gateway_id = None
|
| 503 |
+
|
| 504 |
+
nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID or get_context_str(
|
| 505 |
+
"id:NatGateway"
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
+
if nat_gw_id_from_context:
|
| 509 |
+
print(
|
| 510 |
+
f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}"
|
| 511 |
+
)
|
| 512 |
+
self.single_nat_gateway_id = nat_gw_id_from_context
|
| 513 |
+
|
| 514 |
+
elif (
|
| 515 |
+
new_vpc_created
|
| 516 |
+
and new_vpc_nat_gateways > 0
|
| 517 |
+
and hasattr(vpc, "nat_gateways")
|
| 518 |
+
and vpc.nat_gateways
|
| 519 |
+
):
|
| 520 |
+
self.single_nat_gateway_id = vpc.nat_gateways[0].gateway_id
|
| 521 |
+
print(
|
| 522 |
+
f"Using NAT Gateway {self.single_nat_gateway_id} created by the new VPC construct."
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
if not self.single_nat_gateway_id:
|
| 526 |
+
print("Creating a new NAT gateway")
|
| 527 |
+
|
| 528 |
+
if hasattr(vpc, "nat_gateways") and vpc.nat_gateways:
|
| 529 |
+
print("Existing NAT gateway found in vpc")
|
| 530 |
+
pass
|
| 531 |
+
|
| 532 |
+
# If not in context, create a new one, but only if we have a public subnet.
|
| 533 |
+
elif self.public_subnets:
|
| 534 |
+
print("NAT Gateway ID not found in context. Creating a new one.")
|
| 535 |
+
# Place the NAT GW in the first available public subnet
|
| 536 |
+
first_public_subnet = self.public_subnets[0]
|
| 537 |
+
|
| 538 |
+
self.single_nat_gateway_id = create_nat_gateway(
|
| 539 |
+
self,
|
| 540 |
+
first_public_subnet,
|
| 541 |
+
nat_gateway_name=NAT_GATEWAY_NAME,
|
| 542 |
+
nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID,
|
| 543 |
+
)
|
| 544 |
+
else:
|
| 545 |
+
print(
|
| 546 |
+
"WARNING: No public subnets available and NAT gateway not found in existing VPC. Cannot create a NAT Gateway."
|
| 547 |
+
)
|
| 548 |
+
|
| 549 |
+
# --- 4. Process Private Subnets ---
|
| 550 |
+
print("\n--- Processing Private Subnets ---")
|
| 551 |
+
if checked_private_subnets_ctx:
|
| 552 |
+
for i, subnet_name in enumerate(PRIVATE_SUBNETS_TO_USE):
|
| 553 |
+
subnet_info = checked_private_subnets_ctx.get(subnet_name)
|
| 554 |
+
if subnet_info and subnet_info.get("exists"):
|
| 555 |
+
subnet_id = subnet_info.get("id")
|
| 556 |
+
if not subnet_id:
|
| 557 |
+
raise RuntimeError(
|
| 558 |
+
f"Context for existing private subnet '{subnet_name}' is missing 'id'."
|
| 559 |
+
)
|
| 560 |
+
subnet_az = subnet_info.get("az")
|
| 561 |
+
if (
|
| 562 |
+
not subnet_az
|
| 563 |
+
and PRIVATE_SUBNET_AVAILABILITY_ZONES
|
| 564 |
+
and i < len(PRIVATE_SUBNET_AVAILABILITY_ZONES)
|
| 565 |
+
):
|
| 566 |
+
subnet_az = PRIVATE_SUBNET_AVAILABILITY_ZONES[i]
|
| 567 |
+
if not subnet_az:
|
| 568 |
+
raise RuntimeError(
|
| 569 |
+
f"Context for existing private subnet '{subnet_name}' is missing 'az'."
|
| 570 |
+
)
|
| 571 |
+
subnet_attrs = {
|
| 572 |
+
"subnet_id": subnet_id,
|
| 573 |
+
"availability_zone": subnet_az,
|
| 574 |
+
}
|
| 575 |
+
route_table_id = subnet_info.get("route_table_id")
|
| 576 |
+
if route_table_id:
|
| 577 |
+
subnet_attrs["route_table_id"] = route_table_id
|
| 578 |
+
try:
|
| 579 |
+
imported_subnet = ec2.Subnet.from_subnet_attributes(
|
| 580 |
+
self,
|
| 581 |
+
f"ImportedPrivateSubnet{subnet_name.replace('-', '')}{i}",
|
| 582 |
+
**subnet_attrs,
|
| 583 |
+
)
|
| 584 |
+
self.private_subnets.append(imported_subnet)
|
| 585 |
+
print(
|
| 586 |
+
f"Imported existing private subnet: {subnet_name} (ID: {subnet_id})"
|
| 587 |
+
)
|
| 588 |
+
except Exception as e:
|
| 589 |
+
raise RuntimeError(
|
| 590 |
+
f"Failed to import private subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}"
|
| 591 |
+
)
|
| 592 |
+
|
| 593 |
+
# Create new private subnets
|
| 594 |
+
if private_subnets_data_for_creation_ctx:
|
| 595 |
+
names_to_create_private = [
|
| 596 |
+
s["name"] for s in private_subnets_data_for_creation_ctx
|
| 597 |
+
]
|
| 598 |
+
cidrs_to_create_private = [
|
| 599 |
+
s["cidr"] for s in private_subnets_data_for_creation_ctx
|
| 600 |
+
]
|
| 601 |
+
azs_to_create_private = [
|
| 602 |
+
s["az"] for s in private_subnets_data_for_creation_ctx
|
| 603 |
+
]
|
| 604 |
+
|
| 605 |
+
if names_to_create_private:
|
| 606 |
+
print(
|
| 607 |
+
f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}"
|
| 608 |
+
)
|
| 609 |
+
# --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
|
| 610 |
+
# Ensure self.single_nat_gateway_id is available before this call
|
| 611 |
+
if not self.single_nat_gateway_id:
|
| 612 |
+
raise ValueError(
|
| 613 |
+
"A single NAT Gateway ID is required for private subnets but was not resolved."
|
| 614 |
+
)
|
| 615 |
+
|
| 616 |
+
newly_created_private_subnets_cfn, newly_created_private_rts_cfn = (
|
| 617 |
+
create_subnets(
|
| 618 |
+
self,
|
| 619 |
+
vpc,
|
| 620 |
+
CDK_PREFIX,
|
| 621 |
+
names_to_create_private,
|
| 622 |
+
cidrs_to_create_private,
|
| 623 |
+
azs_to_create_private,
|
| 624 |
+
is_public=False,
|
| 625 |
+
single_nat_gateway_id=self.single_nat_gateway_id, # Pass the single NAT Gateway ID
|
| 626 |
+
)
|
| 627 |
+
)
|
| 628 |
+
self.private_subnets.extend(newly_created_private_subnets_cfn)
|
| 629 |
+
self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
|
| 630 |
+
print(
|
| 631 |
+
f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation."
|
| 632 |
+
)
|
| 633 |
+
else:
|
| 634 |
+
print(
|
| 635 |
+
"No private subnets specified for creation in context ('private_subnets_to_create')."
|
| 636 |
+
)
|
| 637 |
+
|
| 638 |
+
# if not self.private_subnets:
|
| 639 |
+
# raise Exception("No private subnets found or created, exiting.")
|
| 640 |
+
|
| 641 |
+
if (
|
| 642 |
+
not self.private_subnets
|
| 643 |
+
and not names_to_create_private
|
| 644 |
+
and not PRIVATE_SUBNETS_TO_USE
|
| 645 |
+
):
|
| 646 |
+
# This condition might need adjustment for new VPCs.
|
| 647 |
+
raise Exception("No private subnets found or created, exiting.")
|
| 648 |
+
|
| 649 |
+
# --- 5. Sanity Check and Output ---
|
| 650 |
+
# Output the single NAT Gateway ID for verification
|
| 651 |
+
if self.single_nat_gateway_id:
|
| 652 |
+
CfnOutput(
|
| 653 |
+
self,
|
| 654 |
+
"SingleNatGatewayId",
|
| 655 |
+
value=self.single_nat_gateway_id,
|
| 656 |
+
description="ID of the single NAT Gateway resolved or created.",
|
| 657 |
+
)
|
| 658 |
+
elif (
|
| 659 |
+
NEW_VPC_DEFAULT_NAME
|
| 660 |
+
and (self.node.try_get_context("new_vpc_nat_gateways") or 1) > 0
|
| 661 |
+
):
|
| 662 |
+
print(
|
| 663 |
+
"INFO: A new VPC was created with NAT Gateway(s). Their routing is handled by the VPC construct. No single_nat_gateway_id was explicitly set for separate output."
|
| 664 |
+
)
|
| 665 |
+
else:
|
| 666 |
+
out_message = "WARNING: No single NAT Gateway was resolved or created explicitly by the script's logic after VPC setup."
|
| 667 |
+
print(out_message)
|
| 668 |
+
raise Exception(out_message)
|
| 669 |
+
|
| 670 |
+
# --- Outputs for other stacks/regions ---
|
| 671 |
+
# These are crucial for cross-stack, cross-region referencing
|
| 672 |
+
|
| 673 |
+
self.params = dict()
|
| 674 |
+
self.params["vpc_id"] = vpc.vpc_id
|
| 675 |
+
self.params["private_subnets"] = self.private_subnets
|
| 676 |
+
self.params["private_route_tables"] = self.private_route_tables_cfn
|
| 677 |
+
self.params["public_subnets"] = self.public_subnets
|
| 678 |
+
self.params["public_route_tables"] = self.public_route_tables_cfn
|
| 679 |
+
|
| 680 |
+
private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
|
| 681 |
+
public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
|
| 682 |
+
|
| 683 |
+
for sub in private_subnet_selection.subnets:
|
| 684 |
+
print(
|
| 685 |
+
"private subnet:",
|
| 686 |
+
sub.subnet_id,
|
| 687 |
+
"is in availability zone:",
|
| 688 |
+
sub.availability_zone,
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
for sub in public_subnet_selection.subnets:
|
| 692 |
+
print(
|
| 693 |
+
"public subnet:",
|
| 694 |
+
sub.subnet_id,
|
| 695 |
+
"is in availability zone:",
|
| 696 |
+
sub.availability_zone,
|
| 697 |
+
)
|
| 698 |
+
|
| 699 |
+
print("Private subnet route tables:", self.private_route_tables_cfn)
|
| 700 |
+
|
| 701 |
+
# Add the S3 Gateway Endpoint to the VPC
|
| 702 |
+
if names_to_create_private:
|
| 703 |
+
try:
|
| 704 |
+
s3_gateway_endpoint = vpc.add_gateway_endpoint(
|
| 705 |
+
"S3GatewayEndpoint",
|
| 706 |
+
service=ec2.GatewayVpcEndpointAwsService.S3,
|
| 707 |
+
subnets=[private_subnet_selection],
|
| 708 |
+
)
|
| 709 |
+
except Exception as e:
|
| 710 |
+
print("Could not add S3 gateway endpoint to subnets due to:", e)
|
| 711 |
+
|
| 712 |
+
# Output some useful information
|
| 713 |
+
CfnOutput(
|
| 714 |
+
self,
|
| 715 |
+
"VpcIdOutput",
|
| 716 |
+
value=vpc.vpc_id,
|
| 717 |
+
description="The ID of the VPC where the S3 Gateway Endpoint is deployed.",
|
| 718 |
+
)
|
| 719 |
+
CfnOutput(
|
| 720 |
+
self,
|
| 721 |
+
"S3GatewayEndpointService",
|
| 722 |
+
value=s3_gateway_endpoint.vpc_endpoint_id,
|
| 723 |
+
description="The id for the S3 Gateway Endpoint.",
|
| 724 |
+
) # Specify the S3 service
|
| 725 |
+
|
| 726 |
+
# --- IAM Roles ---
|
| 727 |
+
if USE_CUSTOM_KMS_KEY == "1":
|
| 728 |
+
kms_key = kms.Key(
|
| 729 |
+
self,
|
| 730 |
+
"RedactionSharedKmsKey",
|
| 731 |
+
alias=CUSTOM_KMS_KEY_NAME,
|
| 732 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 733 |
+
)
|
| 734 |
+
|
| 735 |
+
custom_sts_kms_policy_dict = {
|
| 736 |
+
"Version": "2012-10-17",
|
| 737 |
+
"Statement": [
|
| 738 |
+
{
|
| 739 |
+
"Sid": "STSCallerIdentity",
|
| 740 |
+
"Effect": "Allow",
|
| 741 |
+
"Action": ["sts:GetCallerIdentity"],
|
| 742 |
+
"Resource": "*",
|
| 743 |
+
},
|
| 744 |
+
{
|
| 745 |
+
"Sid": "KMSAccess",
|
| 746 |
+
"Effect": "Allow",
|
| 747 |
+
"Action": ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"],
|
| 748 |
+
"Resource": kms_key.key_arn, # Use key_arn, as it's the full ARN, safer than key_id
|
| 749 |
+
},
|
| 750 |
+
],
|
| 751 |
+
}
|
| 752 |
+
else:
|
| 753 |
+
kms_key = None
|
| 754 |
+
|
| 755 |
+
custom_sts_kms_policy_dict = {
|
| 756 |
+
"Version": "2012-10-17",
|
| 757 |
+
"Statement": [
|
| 758 |
+
{
|
| 759 |
+
"Sid": "STSCallerIdentity",
|
| 760 |
+
"Effect": "Allow",
|
| 761 |
+
"Action": ["sts:GetCallerIdentity"],
|
| 762 |
+
"Resource": "*",
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"Sid": "KMSSecretsManagerDecrypt", # Explicitly add decrypt for default key
|
| 766 |
+
"Effect": "Allow",
|
| 767 |
+
"Action": ["kms:Decrypt"],
|
| 768 |
+
"Resource": f"arn:aws:kms:{AWS_REGION}:{AWS_ACCOUNT_ID}:key/aws/secretsmanager",
|
| 769 |
+
},
|
| 770 |
+
],
|
| 771 |
+
}
|
| 772 |
+
custom_sts_kms_policy = json.dumps(custom_sts_kms_policy_dict, indent=4)
|
| 773 |
+
|
| 774 |
+
try:
|
| 775 |
+
codebuild_role_name = CODEBUILD_ROLE_NAME
|
| 776 |
+
|
| 777 |
+
if get_context_bool(f"exists:{codebuild_role_name}"):
|
| 778 |
+
# If exists, lookup/import the role using ARN from context
|
| 779 |
+
role_arn = get_context_str(f"arn:{codebuild_role_name}")
|
| 780 |
+
if not role_arn:
|
| 781 |
+
raise ValueError(
|
| 782 |
+
f"Context value 'arn:{codebuild_role_name}' is required if role exists."
|
| 783 |
+
)
|
| 784 |
+
codebuild_role = iam.Role.from_role_arn(
|
| 785 |
+
self, "CodeBuildRole", role_arn=role_arn
|
| 786 |
+
)
|
| 787 |
+
print("Using existing CodeBuild role")
|
| 788 |
+
else:
|
| 789 |
+
# If not exists, create the role
|
| 790 |
+
codebuild_role = iam.Role(
|
| 791 |
+
self,
|
| 792 |
+
"CodeBuildRole", # Logical ID
|
| 793 |
+
role_name=codebuild_role_name, # Explicit resource name
|
| 794 |
+
assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
|
| 795 |
+
)
|
| 796 |
+
codebuild_role.add_managed_policy(
|
| 797 |
+
iam.ManagedPolicy.from_aws_managed_policy_name(
|
| 798 |
+
"EC2InstanceProfileForImageBuilderECRContainerBuilds"
|
| 799 |
+
)
|
| 800 |
+
)
|
| 801 |
+
print("Successfully created new CodeBuild role")
|
| 802 |
+
|
| 803 |
+
task_role_name = ECS_TASK_ROLE_NAME
|
| 804 |
+
if get_context_bool(f"exists:{task_role_name}"):
|
| 805 |
+
role_arn = get_context_str(f"arn:{task_role_name}")
|
| 806 |
+
if not role_arn:
|
| 807 |
+
raise ValueError(
|
| 808 |
+
f"Context value 'arn:{task_role_name}' is required if role exists."
|
| 809 |
+
)
|
| 810 |
+
task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
|
| 811 |
+
print("Using existing ECS task role")
|
| 812 |
+
else:
|
| 813 |
+
task_role = iam.Role(
|
| 814 |
+
self,
|
| 815 |
+
"TaskRole", # Logical ID
|
| 816 |
+
role_name=task_role_name, # Explicit resource name
|
| 817 |
+
assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
|
| 818 |
+
)
|
| 819 |
+
for role in AWS_MANAGED_TASK_ROLES_LIST:
|
| 820 |
+
print(f"Adding {role} to policy")
|
| 821 |
+
task_role.add_managed_policy(
|
| 822 |
+
iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
|
| 823 |
+
)
|
| 824 |
+
task_role = add_custom_policies(
|
| 825 |
+
self, task_role, custom_policy_text=custom_sts_kms_policy
|
| 826 |
+
)
|
| 827 |
+
print("Successfully created new ECS task role")
|
| 828 |
+
|
| 829 |
+
execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
|
| 830 |
+
if get_context_bool(f"exists:{execution_role_name}"):
|
| 831 |
+
role_arn = get_context_str(f"arn:{execution_role_name}")
|
| 832 |
+
if not role_arn:
|
| 833 |
+
raise ValueError(
|
| 834 |
+
f"Context value 'arn:{execution_role_name}' is required if role exists."
|
| 835 |
+
)
|
| 836 |
+
execution_role = iam.Role.from_role_arn(
|
| 837 |
+
self, "ExecutionRole", role_arn=role_arn
|
| 838 |
+
)
|
| 839 |
+
print("Using existing ECS execution role")
|
| 840 |
+
else:
|
| 841 |
+
execution_role = iam.Role(
|
| 842 |
+
self,
|
| 843 |
+
"ExecutionRole", # Logical ID
|
| 844 |
+
role_name=execution_role_name, # Explicit resource name
|
| 845 |
+
assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
|
| 846 |
+
)
|
| 847 |
+
for role in AWS_MANAGED_TASK_ROLES_LIST:
|
| 848 |
+
execution_role.add_managed_policy(
|
| 849 |
+
iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
|
| 850 |
+
)
|
| 851 |
+
execution_role = add_custom_policies(
|
| 852 |
+
self, execution_role, custom_policy_text=custom_sts_kms_policy
|
| 853 |
+
)
|
| 854 |
+
print("Successfully created new ECS execution role")
|
| 855 |
+
|
| 856 |
+
except Exception as e:
|
| 857 |
+
raise Exception("Failed at IAM role step due to:", e)
|
| 858 |
+
|
| 859 |
+
# --- S3 Buckets ---
|
| 860 |
+
try:
|
| 861 |
+
log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
|
| 862 |
+
if get_context_bool(f"exists:{log_bucket_name}"):
|
| 863 |
+
bucket = s3.Bucket.from_bucket_name(
|
| 864 |
+
self, "LogConfigBucket", bucket_name=log_bucket_name
|
| 865 |
+
)
|
| 866 |
+
print("Using existing S3 bucket", log_bucket_name)
|
| 867 |
+
else:
|
| 868 |
+
if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
|
| 869 |
+
bucket = s3.Bucket(
|
| 870 |
+
self,
|
| 871 |
+
"LogConfigBucket",
|
| 872 |
+
bucket_name=log_bucket_name,
|
| 873 |
+
versioned=False,
|
| 874 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 875 |
+
auto_delete_objects=True,
|
| 876 |
+
encryption=s3.BucketEncryption.KMS,
|
| 877 |
+
encryption_key=kms_key,
|
| 878 |
+
)
|
| 879 |
+
else:
|
| 880 |
+
bucket = s3.Bucket(
|
| 881 |
+
self,
|
| 882 |
+
"LogConfigBucket",
|
| 883 |
+
bucket_name=log_bucket_name,
|
| 884 |
+
versioned=False,
|
| 885 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 886 |
+
auto_delete_objects=True,
|
| 887 |
+
)
|
| 888 |
+
|
| 889 |
+
print("Created S3 bucket", log_bucket_name)
|
| 890 |
+
|
| 891 |
+
# Add policies - this will apply to both created and imported buckets
|
| 892 |
+
# CDK handles idempotent policy additions
|
| 893 |
+
bucket.add_to_resource_policy(
|
| 894 |
+
iam.PolicyStatement(
|
| 895 |
+
effect=iam.Effect.ALLOW,
|
| 896 |
+
principals=[task_role], # Pass the role object directly
|
| 897 |
+
actions=["s3:GetObject", "s3:PutObject"],
|
| 898 |
+
resources=[f"{bucket.bucket_arn}/*"],
|
| 899 |
+
)
|
| 900 |
+
)
|
| 901 |
+
bucket.add_to_resource_policy(
|
| 902 |
+
iam.PolicyStatement(
|
| 903 |
+
effect=iam.Effect.ALLOW,
|
| 904 |
+
principals=[task_role],
|
| 905 |
+
actions=["s3:ListBucket"],
|
| 906 |
+
resources=[bucket.bucket_arn],
|
| 907 |
+
)
|
| 908 |
+
)
|
| 909 |
+
|
| 910 |
+
output_bucket_name = S3_OUTPUT_BUCKET_NAME
|
| 911 |
+
if get_context_bool(f"exists:{output_bucket_name}"):
|
| 912 |
+
output_bucket = s3.Bucket.from_bucket_name(
|
| 913 |
+
self, "OutputBucket", bucket_name=output_bucket_name
|
| 914 |
+
)
|
| 915 |
+
print("Using existing Output bucket", output_bucket_name)
|
| 916 |
+
else:
|
| 917 |
+
if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
|
| 918 |
+
output_bucket = s3.Bucket(
|
| 919 |
+
self,
|
| 920 |
+
"OutputBucket",
|
| 921 |
+
bucket_name=output_bucket_name,
|
| 922 |
+
lifecycle_rules=[
|
| 923 |
+
s3.LifecycleRule(
|
| 924 |
+
expiration=Duration.days(
|
| 925 |
+
int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
|
| 926 |
+
)
|
| 927 |
+
)
|
| 928 |
+
],
|
| 929 |
+
versioned=False,
|
| 930 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 931 |
+
auto_delete_objects=True,
|
| 932 |
+
encryption=s3.BucketEncryption.KMS,
|
| 933 |
+
encryption_key=kms_key,
|
| 934 |
+
)
|
| 935 |
+
else:
|
| 936 |
+
output_bucket = s3.Bucket(
|
| 937 |
+
self,
|
| 938 |
+
"OutputBucket",
|
| 939 |
+
bucket_name=output_bucket_name,
|
| 940 |
+
lifecycle_rules=[
|
| 941 |
+
s3.LifecycleRule(
|
| 942 |
+
expiration=Duration.days(
|
| 943 |
+
int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
|
| 944 |
+
)
|
| 945 |
+
)
|
| 946 |
+
],
|
| 947 |
+
versioned=False,
|
| 948 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 949 |
+
auto_delete_objects=True,
|
| 950 |
+
)
|
| 951 |
+
|
| 952 |
+
print("Created Output bucket:", output_bucket_name)
|
| 953 |
+
|
| 954 |
+
# Add policies to output bucket
|
| 955 |
+
output_bucket.add_to_resource_policy(
|
| 956 |
+
iam.PolicyStatement(
|
| 957 |
+
effect=iam.Effect.ALLOW,
|
| 958 |
+
principals=[task_role],
|
| 959 |
+
actions=["s3:GetObject", "s3:PutObject"],
|
| 960 |
+
resources=[f"{output_bucket.bucket_arn}/*"],
|
| 961 |
+
)
|
| 962 |
+
)
|
| 963 |
+
output_bucket.add_to_resource_policy(
|
| 964 |
+
iam.PolicyStatement(
|
| 965 |
+
effect=iam.Effect.ALLOW,
|
| 966 |
+
principals=[task_role],
|
| 967 |
+
actions=["s3:ListBucket"],
|
| 968 |
+
resources=[output_bucket.bucket_arn],
|
| 969 |
+
)
|
| 970 |
+
)
|
| 971 |
+
|
| 972 |
+
except Exception as e:
|
| 973 |
+
raise Exception("Could not handle S3 buckets due to:", e)
|
| 974 |
+
|
| 975 |
+
# --- Elastic Container Registry ---
|
| 976 |
+
try:
|
| 977 |
+
full_ecr_repo_name = ECR_CDK_REPO_NAME
|
| 978 |
+
if get_context_bool(f"exists:{full_ecr_repo_name}"):
|
| 979 |
+
ecr_repo = ecr.Repository.from_repository_name(
|
| 980 |
+
self, "ECRRepo", repository_name=full_ecr_repo_name
|
| 981 |
+
)
|
| 982 |
+
print("Using existing ECR repository")
|
| 983 |
+
else:
|
| 984 |
+
ecr_repo = ecr.Repository(
|
| 985 |
+
self, "ECRRepo", repository_name=full_ecr_repo_name
|
| 986 |
+
) # Explicitly set repository_name
|
| 987 |
+
print("Created ECR repository", full_ecr_repo_name)
|
| 988 |
+
|
| 989 |
+
ecr_image_loc = ecr_repo.repository_uri
|
| 990 |
+
except Exception as e:
|
| 991 |
+
raise Exception("Could not handle ECR repo due to:", e)
|
| 992 |
+
|
| 993 |
+
# --- CODEBUILD ---
|
| 994 |
+
try:
|
| 995 |
+
codebuild_project_name = CODEBUILD_PROJECT_NAME
|
| 996 |
+
if get_context_bool(f"exists:{codebuild_project_name}"):
|
| 997 |
+
# Lookup CodeBuild project by ARN from context
|
| 998 |
+
project_arn = get_context_str(f"arn:{codebuild_project_name}")
|
| 999 |
+
if not project_arn:
|
| 1000 |
+
raise ValueError(
|
| 1001 |
+
f"Context value 'arn:{codebuild_project_name}' is required if project exists."
|
| 1002 |
+
)
|
| 1003 |
+
codebuild.Project.from_project_arn(
|
| 1004 |
+
self, "CodeBuildProject", project_arn=project_arn
|
| 1005 |
+
)
|
| 1006 |
+
print("Using existing CodeBuild project")
|
| 1007 |
+
else:
|
| 1008 |
+
codebuild.Project(
|
| 1009 |
+
self,
|
| 1010 |
+
"CodeBuildProject", # Logical ID
|
| 1011 |
+
project_name=codebuild_project_name, # Explicit resource name
|
| 1012 |
+
role=codebuild_role,
|
| 1013 |
+
source=codebuild.Source.git_hub(
|
| 1014 |
+
owner=GITHUB_REPO_USERNAME,
|
| 1015 |
+
repo=GITHUB_REPO_NAME,
|
| 1016 |
+
branch_or_ref=GITHUB_REPO_BRANCH,
|
| 1017 |
+
),
|
| 1018 |
+
environment=codebuild.BuildEnvironment(
|
| 1019 |
+
build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
|
| 1020 |
+
privileged=True,
|
| 1021 |
+
environment_variables={
|
| 1022 |
+
"ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(
|
| 1023 |
+
value=full_ecr_repo_name
|
| 1024 |
+
),
|
| 1025 |
+
"AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(
|
| 1026 |
+
value=AWS_REGION
|
| 1027 |
+
),
|
| 1028 |
+
"AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(
|
| 1029 |
+
value=AWS_ACCOUNT_ID
|
| 1030 |
+
),
|
| 1031 |
+
"APP_MODE": codebuild.BuildEnvironmentVariable(
|
| 1032 |
+
value="gradio"
|
| 1033 |
+
),
|
| 1034 |
+
},
|
| 1035 |
+
),
|
| 1036 |
+
build_spec=codebuild.BuildSpec.from_object(
|
| 1037 |
+
{
|
| 1038 |
+
"version": "0.2",
|
| 1039 |
+
"phases": {
|
| 1040 |
+
"pre_build": {
|
| 1041 |
+
"commands": [
|
| 1042 |
+
"echo Logging in to Amazon ECR",
|
| 1043 |
+
"aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com",
|
| 1044 |
+
]
|
| 1045 |
+
},
|
| 1046 |
+
"build": {
|
| 1047 |
+
"commands": [
|
| 1048 |
+
"echo Building the Docker image",
|
| 1049 |
+
"docker build --build-args APP_MODE=$APP_MODE --target $APP_MODE -t $ECR_REPO_NAME:latest .",
|
| 1050 |
+
"docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
|
| 1051 |
+
]
|
| 1052 |
+
},
|
| 1053 |
+
"post_build": {
|
| 1054 |
+
"commands": [
|
| 1055 |
+
"echo Pushing the Docker image",
|
| 1056 |
+
"docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
|
| 1057 |
+
]
|
| 1058 |
+
},
|
| 1059 |
+
},
|
| 1060 |
+
}
|
| 1061 |
+
),
|
| 1062 |
+
)
|
| 1063 |
+
print("Successfully created CodeBuild project", codebuild_project_name)
|
| 1064 |
+
|
| 1065 |
+
# Imported projects have role=undefined in CDK; use the actual service
|
| 1066 |
+
# role from context (existing project) or the managed codebuild_role (new).
|
| 1067 |
+
if get_context_bool(f"exists:{codebuild_project_name}"):
|
| 1068 |
+
project_service_role_arn = get_context_str(
|
| 1069 |
+
f"service_role_arn:{codebuild_project_name}"
|
| 1070 |
+
)
|
| 1071 |
+
if project_service_role_arn:
|
| 1072 |
+
ecr_grantee = iam.Role.from_role_arn(
|
| 1073 |
+
self,
|
| 1074 |
+
"CodeBuildProjectServiceRole",
|
| 1075 |
+
role_arn=project_service_role_arn,
|
| 1076 |
+
mutable=True,
|
| 1077 |
+
)
|
| 1078 |
+
else:
|
| 1079 |
+
ecr_grantee = codebuild_role
|
| 1080 |
+
else:
|
| 1081 |
+
ecr_grantee = codebuild_role
|
| 1082 |
+
ecr_repo.grant_pull_push(ecr_grantee)
|
| 1083 |
+
|
| 1084 |
+
except Exception as e:
|
| 1085 |
+
raise Exception("Could not handle Codebuild project due to:", e)
|
| 1086 |
+
|
| 1087 |
+
# --- Security Groups ---
|
| 1088 |
+
try:
|
| 1089 |
+
ecs_security_group_name = ECS_SECURITY_GROUP_NAME
|
| 1090 |
+
|
| 1091 |
+
try:
|
| 1092 |
+
ecs_security_group = ec2.SecurityGroup(
|
| 1093 |
+
self,
|
| 1094 |
+
"ECSSecurityGroup", # Logical ID
|
| 1095 |
+
security_group_name=ecs_security_group_name, # Explicit resource name
|
| 1096 |
+
vpc=vpc,
|
| 1097 |
+
)
|
| 1098 |
+
print(f"Created Security Group: {ecs_security_group_name}")
|
| 1099 |
+
except Exception as e: # If lookup fails, create
|
| 1100 |
+
print("Failed to create ECS security group due to:", e)
|
| 1101 |
+
|
| 1102 |
+
alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
|
| 1103 |
+
|
| 1104 |
+
try:
|
| 1105 |
+
alb_security_group = ec2.SecurityGroup(
|
| 1106 |
+
self,
|
| 1107 |
+
"ALBSecurityGroup", # Logical ID
|
| 1108 |
+
security_group_name=alb_security_group_name, # Explicit resource name
|
| 1109 |
+
vpc=vpc,
|
| 1110 |
+
)
|
| 1111 |
+
print(f"Created Security Group: {alb_security_group_name}")
|
| 1112 |
+
except Exception as e: # If lookup fails, create
|
| 1113 |
+
print("Failed to create ALB security group due to:", e)
|
| 1114 |
+
|
| 1115 |
+
# Define Ingress Rules - CDK will manage adding/removing these as needed
|
| 1116 |
+
ec2_port_gradio_server_port = ec2.Port.tcp(
|
| 1117 |
+
int(GRADIO_SERVER_PORT)
|
| 1118 |
+
) # Ensure port is int
|
| 1119 |
+
ecs_security_group.add_ingress_rule(
|
| 1120 |
+
peer=alb_security_group,
|
| 1121 |
+
connection=ec2_port_gradio_server_port,
|
| 1122 |
+
description="ALB traffic",
|
| 1123 |
+
)
|
| 1124 |
+
|
| 1125 |
+
alb_security_group.add_ingress_rule(
|
| 1126 |
+
peer=ec2.Peer.prefix_list("pl-93a247fa"),
|
| 1127 |
+
connection=ec2.Port.all_traffic(),
|
| 1128 |
+
description="CloudFront traffic",
|
| 1129 |
+
)
|
| 1130 |
+
|
| 1131 |
+
except Exception as e:
|
| 1132 |
+
raise Exception("Could not handle security groups due to:", e)
|
| 1133 |
+
|
| 1134 |
+
# --- DynamoDB tables for logs (optional) ---
|
| 1135 |
+
|
| 1136 |
+
if SAVE_LOGS_TO_DYNAMODB == "True":
|
| 1137 |
+
try:
|
| 1138 |
+
print("Creating DynamoDB tables for logs")
|
| 1139 |
+
|
| 1140 |
+
dynamodb.Table(
|
| 1141 |
+
self,
|
| 1142 |
+
"RedactionAccessDataTable",
|
| 1143 |
+
table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
|
| 1144 |
+
partition_key=dynamodb.Attribute(
|
| 1145 |
+
name="id", type=dynamodb.AttributeType.STRING
|
| 1146 |
+
),
|
| 1147 |
+
billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
|
| 1148 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 1149 |
+
)
|
| 1150 |
+
|
| 1151 |
+
dynamodb.Table(
|
| 1152 |
+
self,
|
| 1153 |
+
"RedactionFeedbackDataTable",
|
| 1154 |
+
table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
|
| 1155 |
+
partition_key=dynamodb.Attribute(
|
| 1156 |
+
name="id", type=dynamodb.AttributeType.STRING
|
| 1157 |
+
),
|
| 1158 |
+
billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
|
| 1159 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 1160 |
+
)
|
| 1161 |
+
|
| 1162 |
+
dynamodb.Table(
|
| 1163 |
+
self,
|
| 1164 |
+
"RedactionUsageDataTable",
|
| 1165 |
+
table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
|
| 1166 |
+
partition_key=dynamodb.Attribute(
|
| 1167 |
+
name="id", type=dynamodb.AttributeType.STRING
|
| 1168 |
+
),
|
| 1169 |
+
billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
|
| 1170 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 1171 |
+
)
|
| 1172 |
+
|
| 1173 |
+
except Exception as e:
|
| 1174 |
+
raise Exception("Could not create DynamoDB tables due to:", e)
|
| 1175 |
+
|
| 1176 |
+
# --- ALB ---
|
| 1177 |
+
try:
|
| 1178 |
+
load_balancer_name = ALB_NAME
|
| 1179 |
+
if len(load_balancer_name) > 32:
|
| 1180 |
+
load_balancer_name = load_balancer_name[-32:]
|
| 1181 |
+
alb_arn = get_context_str(f"arn:{load_balancer_name}") or (
|
| 1182 |
+
EXISTING_LOAD_BALANCER_ARN or None
|
| 1183 |
+
)
|
| 1184 |
+
alb_dns_name = get_context_str(f"dns:{load_balancer_name}") or (
|
| 1185 |
+
EXISTING_LOAD_BALANCER_DNS or None
|
| 1186 |
+
)
|
| 1187 |
+
if alb_arn and alb_dns_name:
|
| 1188 |
+
alb_security_group_id = (
|
| 1189 |
+
get_context_str(f"security_group_id:{load_balancer_name}")
|
| 1190 |
+
or alb_security_group.security_group_id
|
| 1191 |
+
)
|
| 1192 |
+
alb_attrs = {
|
| 1193 |
+
"load_balancer_arn": alb_arn,
|
| 1194 |
+
"load_balancer_dns_name": alb_dns_name,
|
| 1195 |
+
"security_group_id": alb_security_group_id,
|
| 1196 |
+
"vpc": vpc,
|
| 1197 |
+
}
|
| 1198 |
+
alb_canonical_zone_id = get_context_str(
|
| 1199 |
+
f"canonical_hosted_zone_id:{load_balancer_name}"
|
| 1200 |
+
)
|
| 1201 |
+
if alb_canonical_zone_id:
|
| 1202 |
+
alb_attrs["load_balancer_canonical_hosted_zone_id"] = (
|
| 1203 |
+
alb_canonical_zone_id
|
| 1204 |
+
)
|
| 1205 |
+
alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
|
| 1206 |
+
self,
|
| 1207 |
+
"ALB",
|
| 1208 |
+
**alb_attrs,
|
| 1209 |
+
)
|
| 1210 |
+
print(f"Using existing Application Load Balancer {load_balancer_name}.")
|
| 1211 |
+
else:
|
| 1212 |
+
alb = elbv2.ApplicationLoadBalancer(
|
| 1213 |
+
self,
|
| 1214 |
+
"ALB", # Logical ID
|
| 1215 |
+
load_balancer_name=load_balancer_name, # Explicit resource name
|
| 1216 |
+
vpc=vpc,
|
| 1217 |
+
internet_facing=True,
|
| 1218 |
+
security_group=alb_security_group, # Link to SG
|
| 1219 |
+
vpc_subnets=public_subnet_selection, # Link to subnets
|
| 1220 |
+
)
|
| 1221 |
+
print("Successfully created new Application Load Balancer")
|
| 1222 |
+
except Exception as e:
|
| 1223 |
+
raise Exception("Could not handle application load balancer due to:", e)
|
| 1224 |
+
|
| 1225 |
+
# --- Cognito User Pool ---
|
| 1226 |
+
try:
|
| 1227 |
+
if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
|
| 1228 |
+
# Lookup by ID from context
|
| 1229 |
+
user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
|
| 1230 |
+
if not user_pool_id:
|
| 1231 |
+
raise ValueError(
|
| 1232 |
+
f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists."
|
| 1233 |
+
)
|
| 1234 |
+
user_pool = cognito.UserPool.from_user_pool_id(
|
| 1235 |
+
self, "UserPool", user_pool_id=user_pool_id
|
| 1236 |
+
)
|
| 1237 |
+
print(f"Using existing user pool {user_pool_id}.")
|
| 1238 |
+
else:
|
| 1239 |
+
user_pool = cognito.UserPool(
|
| 1240 |
+
self,
|
| 1241 |
+
"UserPool",
|
| 1242 |
+
user_pool_name=COGNITO_USER_POOL_NAME,
|
| 1243 |
+
mfa=cognito.Mfa.OFF, # Adjust as needed
|
| 1244 |
+
sign_in_aliases=cognito.SignInAliases(email=True),
|
| 1245 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 1246 |
+
) # Adjust as needed
|
| 1247 |
+
print(f"Created new user pool {user_pool.user_pool_id}.")
|
| 1248 |
+
|
| 1249 |
+
# If you're using a certificate, assume that you will be using the ALB Cognito login features. You need different redirect URLs to accept the token that comes from Cognito authentication.
|
| 1250 |
+
if ACM_SSL_CERTIFICATE_ARN:
|
| 1251 |
+
redirect_uris = [
|
| 1252 |
+
COGNITO_REDIRECTION_URL,
|
| 1253 |
+
COGNITO_REDIRECTION_URL + "/oauth2/idpresponse",
|
| 1254 |
+
]
|
| 1255 |
+
else:
|
| 1256 |
+
redirect_uris = [COGNITO_REDIRECTION_URL]
|
| 1257 |
+
|
| 1258 |
+
user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
|
| 1259 |
+
if get_context_bool(f"exists:{user_pool_client_name}"):
|
| 1260 |
+
# Lookup by ID from context (requires User Pool object)
|
| 1261 |
+
user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
|
| 1262 |
+
if not user_pool_client_id:
|
| 1263 |
+
raise ValueError(
|
| 1264 |
+
f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists."
|
| 1265 |
+
)
|
| 1266 |
+
user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(
|
| 1267 |
+
self, "UserPoolClient", user_pool_client_id=user_pool_client_id
|
| 1268 |
+
)
|
| 1269 |
+
print(f"Using existing user pool client {user_pool_client_id}.")
|
| 1270 |
+
else:
|
| 1271 |
+
user_pool_client = cognito.UserPoolClient(
|
| 1272 |
+
self,
|
| 1273 |
+
"UserPoolClient",
|
| 1274 |
+
auth_flows=cognito.AuthFlow(
|
| 1275 |
+
user_srp=True, user_password=True
|
| 1276 |
+
), # Example: enable SRP for secure sign-in
|
| 1277 |
+
user_pool=user_pool,
|
| 1278 |
+
generate_secret=True,
|
| 1279 |
+
user_pool_client_name=user_pool_client_name,
|
| 1280 |
+
supported_identity_providers=[
|
| 1281 |
+
cognito.UserPoolClientIdentityProvider.COGNITO
|
| 1282 |
+
],
|
| 1283 |
+
o_auth=cognito.OAuthSettings(
|
| 1284 |
+
flows=cognito.OAuthFlows(authorization_code_grant=True),
|
| 1285 |
+
scopes=[
|
| 1286 |
+
cognito.OAuthScope.OPENID,
|
| 1287 |
+
cognito.OAuthScope.EMAIL,
|
| 1288 |
+
cognito.OAuthScope.PROFILE,
|
| 1289 |
+
],
|
| 1290 |
+
callback_urls=redirect_uris,
|
| 1291 |
+
),
|
| 1292 |
+
refresh_token_validity=Duration.minutes(
|
| 1293 |
+
COGNITO_REFRESH_TOKEN_VALIDITY
|
| 1294 |
+
),
|
| 1295 |
+
id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
|
| 1296 |
+
access_token_validity=Duration.minutes(
|
| 1297 |
+
COGNITO_ACCESS_TOKEN_VALIDITY
|
| 1298 |
+
),
|
| 1299 |
+
)
|
| 1300 |
+
|
| 1301 |
+
CfnOutput(
|
| 1302 |
+
self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id
|
| 1303 |
+
)
|
| 1304 |
+
|
| 1305 |
+
print(
|
| 1306 |
+
f"Created new user pool client {user_pool_client.user_pool_client_id}."
|
| 1307 |
+
)
|
| 1308 |
+
|
| 1309 |
+
# Add a domain to the User Pool (crucial for ALB integration)
|
| 1310 |
+
user_pool_domain = user_pool.add_domain(
|
| 1311 |
+
"UserPoolDomain",
|
| 1312 |
+
cognito_domain=cognito.CognitoDomainOptions(
|
| 1313 |
+
domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX
|
| 1314 |
+
),
|
| 1315 |
+
)
|
| 1316 |
+
|
| 1317 |
+
# Apply removal_policy to the created UserPoolDomain construct
|
| 1318 |
+
user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
|
| 1319 |
+
|
| 1320 |
+
CfnOutput(
|
| 1321 |
+
self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url()
|
| 1322 |
+
)
|
| 1323 |
+
|
| 1324 |
+
except Exception as e:
|
| 1325 |
+
raise Exception("Could not handle Cognito resources due to:", e)
|
| 1326 |
+
|
| 1327 |
+
# --- Secrets Manager Secret ---
|
| 1328 |
+
try:
|
| 1329 |
+
secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
|
| 1330 |
+
if get_context_bool(f"exists:{secret_name}"):
|
| 1331 |
+
# Lookup by name
|
| 1332 |
+
secret = secretsmanager.Secret.from_secret_name_v2(
|
| 1333 |
+
self, "CognitoSecret", secret_name=secret_name
|
| 1334 |
+
)
|
| 1335 |
+
print("Using existing Secret.")
|
| 1336 |
+
else:
|
| 1337 |
+
if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
|
| 1338 |
+
secret = secretsmanager.Secret(
|
| 1339 |
+
self,
|
| 1340 |
+
"CognitoSecret", # Logical ID
|
| 1341 |
+
secret_name=secret_name, # Explicit resource name
|
| 1342 |
+
secret_object_value={
|
| 1343 |
+
"REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
|
| 1344 |
+
user_pool.user_pool_id
|
| 1345 |
+
), # Use the CDK attribute
|
| 1346 |
+
"REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
|
| 1347 |
+
user_pool_client.user_pool_client_id
|
| 1348 |
+
), # Use the CDK attribute
|
| 1349 |
+
"REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
|
| 1350 |
+
},
|
| 1351 |
+
encryption_key=kms_key,
|
| 1352 |
+
)
|
| 1353 |
+
else:
|
| 1354 |
+
secret = secretsmanager.Secret(
|
| 1355 |
+
self,
|
| 1356 |
+
"CognitoSecret", # Logical ID
|
| 1357 |
+
secret_name=secret_name, # Explicit resource name
|
| 1358 |
+
secret_object_value={
|
| 1359 |
+
"REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
|
| 1360 |
+
user_pool.user_pool_id
|
| 1361 |
+
), # Use the CDK attribute
|
| 1362 |
+
"REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
|
| 1363 |
+
user_pool_client.user_pool_client_id
|
| 1364 |
+
), # Use the CDK attribute
|
| 1365 |
+
"REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
|
| 1366 |
+
},
|
| 1367 |
+
)
|
| 1368 |
+
|
| 1369 |
+
print(
|
| 1370 |
+
"Created new secret in Secrets Manager for Cognito user pool and related details."
|
| 1371 |
+
)
|
| 1372 |
+
|
| 1373 |
+
except Exception as e:
|
| 1374 |
+
raise Exception("Could not handle Secrets Manager secret due to:", e)
|
| 1375 |
+
|
| 1376 |
+
# --- Fargate Task Definition ---
|
| 1377 |
+
try:
|
| 1378 |
+
fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
|
| 1379 |
+
|
| 1380 |
+
read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == "True"
|
| 1381 |
+
|
| 1382 |
+
if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
|
| 1383 |
+
with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
|
| 1384 |
+
task_def_params = json.load(f)
|
| 1385 |
+
# Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
|
| 1386 |
+
else:
|
| 1387 |
+
epheremal_storage_volume_name = "appEphemeralVolume"
|
| 1388 |
+
|
| 1389 |
+
task_def_params = {}
|
| 1390 |
+
task_def_params["taskRoleArn"] = (
|
| 1391 |
+
task_role.role_arn
|
| 1392 |
+
) # Use CDK role object ARN
|
| 1393 |
+
task_def_params["executionRoleArn"] = (
|
| 1394 |
+
execution_role.role_arn
|
| 1395 |
+
) # Use CDK role object ARN
|
| 1396 |
+
task_def_params["memory"] = ECS_TASK_MEMORY_SIZE
|
| 1397 |
+
task_def_params["cpu"] = ECS_TASK_CPU_SIZE
|
| 1398 |
+
container_def = {
|
| 1399 |
+
"name": full_ecr_repo_name,
|
| 1400 |
+
"image": ecr_image_loc + ":latest",
|
| 1401 |
+
"essential": True,
|
| 1402 |
+
"portMappings": [
|
| 1403 |
+
{
|
| 1404 |
+
"containerPort": int(GRADIO_SERVER_PORT),
|
| 1405 |
+
"hostPort": int(GRADIO_SERVER_PORT),
|
| 1406 |
+
"protocol": "tcp",
|
| 1407 |
+
"appProtocol": "http",
|
| 1408 |
+
}
|
| 1409 |
+
],
|
| 1410 |
+
"logConfiguration": {
|
| 1411 |
+
"logDriver": "awslogs",
|
| 1412 |
+
"options": {
|
| 1413 |
+
"awslogs-group": ECS_LOG_GROUP_NAME,
|
| 1414 |
+
"awslogs-region": AWS_REGION,
|
| 1415 |
+
"awslogs-stream-prefix": "ecs",
|
| 1416 |
+
},
|
| 1417 |
+
},
|
| 1418 |
+
"environmentFiles": [
|
| 1419 |
+
{"value": bucket.bucket_arn + "/config.env", "type": "s3"}
|
| 1420 |
+
],
|
| 1421 |
+
"memoryReservation": int(task_def_params["memory"])
|
| 1422 |
+
- 512, # Reserve some memory for the container
|
| 1423 |
+
"mountPoints": [
|
| 1424 |
+
{
|
| 1425 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1426 |
+
"containerPath": "/home/user/app/logs",
|
| 1427 |
+
"readOnly": False,
|
| 1428 |
+
},
|
| 1429 |
+
{
|
| 1430 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1431 |
+
"containerPath": "/home/user/app/feedback",
|
| 1432 |
+
"readOnly": False,
|
| 1433 |
+
},
|
| 1434 |
+
{
|
| 1435 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1436 |
+
"containerPath": "/home/user/app/usage",
|
| 1437 |
+
"readOnly": False,
|
| 1438 |
+
},
|
| 1439 |
+
{
|
| 1440 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1441 |
+
"containerPath": "/home/user/app/input",
|
| 1442 |
+
"readOnly": False,
|
| 1443 |
+
},
|
| 1444 |
+
{
|
| 1445 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1446 |
+
"containerPath": "/home/user/app/output",
|
| 1447 |
+
"readOnly": False,
|
| 1448 |
+
},
|
| 1449 |
+
{
|
| 1450 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1451 |
+
"containerPath": "/home/user/app/tmp",
|
| 1452 |
+
"readOnly": False,
|
| 1453 |
+
},
|
| 1454 |
+
{
|
| 1455 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1456 |
+
"containerPath": "/home/user/app/config",
|
| 1457 |
+
"readOnly": False,
|
| 1458 |
+
},
|
| 1459 |
+
{
|
| 1460 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1461 |
+
"containerPath": "/tmp/matplotlib_cache",
|
| 1462 |
+
"readOnly": False,
|
| 1463 |
+
},
|
| 1464 |
+
{
|
| 1465 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1466 |
+
"containerPath": "/tmp",
|
| 1467 |
+
"readOnly": False,
|
| 1468 |
+
},
|
| 1469 |
+
{
|
| 1470 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1471 |
+
"containerPath": "/var/tmp",
|
| 1472 |
+
"readOnly": False,
|
| 1473 |
+
},
|
| 1474 |
+
{
|
| 1475 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1476 |
+
"containerPath": "/tmp/tld",
|
| 1477 |
+
"readOnly": False,
|
| 1478 |
+
},
|
| 1479 |
+
{
|
| 1480 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1481 |
+
"containerPath": "/tmp/gradio_tmp",
|
| 1482 |
+
"readOnly": False,
|
| 1483 |
+
},
|
| 1484 |
+
{
|
| 1485 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1486 |
+
"containerPath": "/home/user/.paddlex",
|
| 1487 |
+
"readOnly": False,
|
| 1488 |
+
},
|
| 1489 |
+
{
|
| 1490 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1491 |
+
"containerPath": "/home/user/.local/share/spacy/data",
|
| 1492 |
+
"readOnly": False,
|
| 1493 |
+
},
|
| 1494 |
+
{
|
| 1495 |
+
"sourceVolume": epheremal_storage_volume_name,
|
| 1496 |
+
"containerPath": "/usr/share/tessdata",
|
| 1497 |
+
"readOnly": False,
|
| 1498 |
+
},
|
| 1499 |
+
],
|
| 1500 |
+
"readonlyRootFilesystem": read_only_file_system,
|
| 1501 |
+
}
|
| 1502 |
+
task_def_params["containerDefinitions"] = [container_def]
|
| 1503 |
+
|
| 1504 |
+
log_group_name_from_config = task_def_params["containerDefinitions"][0][
|
| 1505 |
+
"logConfiguration"
|
| 1506 |
+
]["options"]["awslogs-group"]
|
| 1507 |
+
|
| 1508 |
+
cdk_managed_log_group = logs.LogGroup(
|
| 1509 |
+
self,
|
| 1510 |
+
"MyTaskLogGroup", # CDK Logical ID
|
| 1511 |
+
log_group_name=log_group_name_from_config,
|
| 1512 |
+
retention=logs.RetentionDays.ONE_MONTH,
|
| 1513 |
+
removal_policy=RemovalPolicy.DESTROY,
|
| 1514 |
+
)
|
| 1515 |
+
|
| 1516 |
+
epheremal_storage_volume_cdk_obj = ecs.Volume(
|
| 1517 |
+
name=epheremal_storage_volume_name
|
| 1518 |
+
)
|
| 1519 |
+
|
| 1520 |
+
fargate_task_definition = ecs.FargateTaskDefinition(
|
| 1521 |
+
self,
|
| 1522 |
+
"FargateTaskDefinition", # Logical ID
|
| 1523 |
+
family=fargate_task_definition_name,
|
| 1524 |
+
cpu=int(task_def_params["cpu"]),
|
| 1525 |
+
memory_limit_mib=int(task_def_params["memory"]),
|
| 1526 |
+
task_role=task_role,
|
| 1527 |
+
execution_role=execution_role,
|
| 1528 |
+
runtime_platform=ecs.RuntimePlatform(
|
| 1529 |
+
cpu_architecture=ecs.CpuArchitecture.X86_64,
|
| 1530 |
+
operating_system_family=ecs.OperatingSystemFamily.LINUX,
|
| 1531 |
+
),
|
| 1532 |
+
ephemeral_storage_gib=21, # Minimum is 21 GiB
|
| 1533 |
+
volumes=[epheremal_storage_volume_cdk_obj],
|
| 1534 |
+
)
|
| 1535 |
+
print("Fargate task definition defined.")
|
| 1536 |
+
|
| 1537 |
+
# Add container definitions to the task definition object
|
| 1538 |
+
if task_def_params["containerDefinitions"]:
|
| 1539 |
+
container_def_params = task_def_params["containerDefinitions"][0]
|
| 1540 |
+
|
| 1541 |
+
if container_def_params.get("environmentFiles"):
|
| 1542 |
+
env_files = []
|
| 1543 |
+
for env_file_param in container_def_params["environmentFiles"]:
|
| 1544 |
+
# Need to parse the ARN to get the bucket object and key
|
| 1545 |
+
env_file_arn_parts = env_file_param["value"].split(":::")
|
| 1546 |
+
bucket_name_and_key = env_file_arn_parts[-1]
|
| 1547 |
+
env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
|
| 1548 |
+
|
| 1549 |
+
env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
|
| 1550 |
+
|
| 1551 |
+
env_files.append(env_file)
|
| 1552 |
+
|
| 1553 |
+
container = fargate_task_definition.add_container(
|
| 1554 |
+
container_def_params["name"],
|
| 1555 |
+
image=ecs.ContainerImage.from_registry(
|
| 1556 |
+
container_def_params["image"]
|
| 1557 |
+
),
|
| 1558 |
+
logging=ecs.LogDriver.aws_logs(
|
| 1559 |
+
stream_prefix=container_def_params["logConfiguration"][
|
| 1560 |
+
"options"
|
| 1561 |
+
]["awslogs-stream-prefix"],
|
| 1562 |
+
log_group=cdk_managed_log_group,
|
| 1563 |
+
),
|
| 1564 |
+
secrets={
|
| 1565 |
+
"AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(
|
| 1566 |
+
secret, "REDACTION_USER_POOL_ID"
|
| 1567 |
+
),
|
| 1568 |
+
"AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(
|
| 1569 |
+
secret, "REDACTION_CLIENT_ID"
|
| 1570 |
+
),
|
| 1571 |
+
"AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(
|
| 1572 |
+
secret, "REDACTION_CLIENT_SECRET"
|
| 1573 |
+
),
|
| 1574 |
+
},
|
| 1575 |
+
environment_files=env_files,
|
| 1576 |
+
readonly_root_filesystem=read_only_file_system,
|
| 1577 |
+
)
|
| 1578 |
+
|
| 1579 |
+
for port_mapping in container_def_params["portMappings"]:
|
| 1580 |
+
container.add_port_mappings(
|
| 1581 |
+
ecs.PortMapping(
|
| 1582 |
+
container_port=int(port_mapping["containerPort"]),
|
| 1583 |
+
host_port=int(port_mapping["hostPort"]),
|
| 1584 |
+
name="port-" + str(port_mapping["containerPort"]),
|
| 1585 |
+
app_protocol=ecs.AppProtocol.http,
|
| 1586 |
+
protocol=ecs.Protocol.TCP,
|
| 1587 |
+
)
|
| 1588 |
+
)
|
| 1589 |
+
|
| 1590 |
+
container.add_port_mappings(
|
| 1591 |
+
ecs.PortMapping(
|
| 1592 |
+
container_port=80,
|
| 1593 |
+
host_port=80,
|
| 1594 |
+
name="port-80",
|
| 1595 |
+
app_protocol=ecs.AppProtocol.http,
|
| 1596 |
+
protocol=ecs.Protocol.TCP,
|
| 1597 |
+
)
|
| 1598 |
+
)
|
| 1599 |
+
|
| 1600 |
+
if container_def_params.get("mountPoints"):
|
| 1601 |
+
mount_points = []
|
| 1602 |
+
for mount_point in container_def_params["mountPoints"]:
|
| 1603 |
+
mount_points.append(
|
| 1604 |
+
ecs.MountPoint(
|
| 1605 |
+
container_path=mount_point["containerPath"],
|
| 1606 |
+
read_only=mount_point["readOnly"],
|
| 1607 |
+
source_volume=epheremal_storage_volume_name,
|
| 1608 |
+
)
|
| 1609 |
+
)
|
| 1610 |
+
container.add_mount_points(*mount_points)
|
| 1611 |
+
|
| 1612 |
+
except Exception as e:
|
| 1613 |
+
raise Exception("Could not handle Fargate task definition due to:", e)
|
| 1614 |
+
|
| 1615 |
+
# --- ECS Cluster ---
|
| 1616 |
+
try:
|
| 1617 |
+
cluster = ecs.Cluster(
|
| 1618 |
+
self,
|
| 1619 |
+
"ECSCluster", # Logical ID
|
| 1620 |
+
cluster_name=CLUSTER_NAME, # Explicit resource name
|
| 1621 |
+
enable_fargate_capacity_providers=True,
|
| 1622 |
+
vpc=vpc,
|
| 1623 |
+
)
|
| 1624 |
+
print("Successfully created new ECS cluster")
|
| 1625 |
+
except Exception as e:
|
| 1626 |
+
raise Exception("Could not handle ECS cluster due to:", e)
|
| 1627 |
+
|
| 1628 |
+
# --- ECS Service ---
|
| 1629 |
+
try:
|
| 1630 |
+
ecs_service_name = ECS_SERVICE_NAME
|
| 1631 |
+
|
| 1632 |
+
if ECS_USE_FARGATE_SPOT == "True":
|
| 1633 |
+
use_fargate_spot = "FARGATE_SPOT"
|
| 1634 |
+
if ECS_USE_FARGATE_SPOT == "False":
|
| 1635 |
+
use_fargate_spot = "FARGATE"
|
| 1636 |
+
|
| 1637 |
+
# Check if service exists - from_service_arn or from_service_name (needs cluster)
|
| 1638 |
+
try:
|
| 1639 |
+
# from_service_name is useful if you have the cluster object
|
| 1640 |
+
ecs_service = ecs.FargateService.from_service_attributes(
|
| 1641 |
+
self,
|
| 1642 |
+
"ECSService", # Logical ID
|
| 1643 |
+
cluster=cluster, # Requires the cluster object
|
| 1644 |
+
service_name=ecs_service_name,
|
| 1645 |
+
)
|
| 1646 |
+
print(f"Using existing ECS service {ecs_service_name}.")
|
| 1647 |
+
except Exception:
|
| 1648 |
+
# Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
|
| 1649 |
+
ecs_service = ecs.FargateService(
|
| 1650 |
+
self,
|
| 1651 |
+
"ECSService", # Logical ID
|
| 1652 |
+
service_name=ecs_service_name, # Explicit resource name
|
| 1653 |
+
platform_version=ecs.FargatePlatformVersion.LATEST,
|
| 1654 |
+
capacity_provider_strategies=[
|
| 1655 |
+
ecs.CapacityProviderStrategy(
|
| 1656 |
+
capacity_provider=use_fargate_spot, base=0, weight=1
|
| 1657 |
+
)
|
| 1658 |
+
],
|
| 1659 |
+
cluster=cluster,
|
| 1660 |
+
task_definition=fargate_task_definition, # Link to TD
|
| 1661 |
+
security_groups=[ecs_security_group], # Link to SG
|
| 1662 |
+
vpc_subnets=ec2.SubnetSelection(
|
| 1663 |
+
subnets=self.private_subnets
|
| 1664 |
+
), # Link to subnets
|
| 1665 |
+
min_healthy_percent=0,
|
| 1666 |
+
max_healthy_percent=100,
|
| 1667 |
+
desired_count=0,
|
| 1668 |
+
)
|
| 1669 |
+
print("Successfully created new ECS service")
|
| 1670 |
+
|
| 1671 |
+
# Note: Auto-scaling setup would typically go here if needed for the service
|
| 1672 |
+
|
| 1673 |
+
except Exception as e:
|
| 1674 |
+
raise Exception("Could not handle ECS service due to:", e)
|
| 1675 |
+
|
| 1676 |
+
# --- Grant Secret Read Access (Applies to both created and imported roles) ---
|
| 1677 |
+
try:
|
| 1678 |
+
secret.grant_read(task_role)
|
| 1679 |
+
secret.grant_read(execution_role)
|
| 1680 |
+
except Exception as e:
|
| 1681 |
+
raise Exception("Could not grant access to Secrets Manager due to:", e)
|
| 1682 |
+
|
| 1683 |
+
# --- ALB TARGET GROUPS AND LISTENERS ---
|
| 1684 |
+
# This section should primarily define the resources if they are managed by this stack.
|
| 1685 |
+
# CDK handles adding/removing targets and actions on updates.
|
| 1686 |
+
# If they might pre-exist outside the stack, you need lookups.
|
| 1687 |
+
cookie_duration = Duration.hours(12)
|
| 1688 |
+
target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
|
| 1689 |
+
cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
|
| 1690 |
+
|
| 1691 |
+
try:
|
| 1692 |
+
# --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
|
| 1693 |
+
|
| 1694 |
+
target_group = elbv2.ApplicationTargetGroup(
|
| 1695 |
+
self,
|
| 1696 |
+
"AppTargetGroup", # Logical ID
|
| 1697 |
+
target_group_name=target_group_name, # Explicit resource name
|
| 1698 |
+
port=int(GRADIO_SERVER_PORT), # Ensure port is int
|
| 1699 |
+
protocol=elbv2.ApplicationProtocol.HTTP,
|
| 1700 |
+
targets=[ecs_service], # Link to ECS Service
|
| 1701 |
+
stickiness_cookie_duration=cookie_duration,
|
| 1702 |
+
vpc=vpc, # Target Groups need VPC
|
| 1703 |
+
)
|
| 1704 |
+
print(f"ALB target group {target_group_name} defined.")
|
| 1705 |
+
|
| 1706 |
+
# First HTTP
|
| 1707 |
+
listener_port = 80
|
| 1708 |
+
# Check if Listener exists - from_listener_arn or lookup by port/ALB
|
| 1709 |
+
|
| 1710 |
+
http_listener = alb.add_listener(
|
| 1711 |
+
"HttpListener", # Logical ID
|
| 1712 |
+
port=listener_port,
|
| 1713 |
+
open=False, # Be cautious with open=True, usually restrict source SG
|
| 1714 |
+
)
|
| 1715 |
+
print(f"ALB listener on port {listener_port} defined.")
|
| 1716 |
+
|
| 1717 |
+
if ACM_SSL_CERTIFICATE_ARN:
|
| 1718 |
+
http_listener.add_action(
|
| 1719 |
+
"DefaultAction", # Logical ID for the default action
|
| 1720 |
+
action=elbv2.ListenerAction.redirect(
|
| 1721 |
+
protocol="HTTPS",
|
| 1722 |
+
host="#{host}",
|
| 1723 |
+
port="443",
|
| 1724 |
+
path="/#{path}",
|
| 1725 |
+
query="#{query}",
|
| 1726 |
+
),
|
| 1727 |
+
)
|
| 1728 |
+
else:
|
| 1729 |
+
if USE_CLOUDFRONT == "True":
|
| 1730 |
+
|
| 1731 |
+
# The following default action can be added for the listener after a host header rule is added to the listener manually in the Console as suggested in the above comments.
|
| 1732 |
+
http_listener.add_action(
|
| 1733 |
+
"DefaultAction", # Logical ID for the default action
|
| 1734 |
+
action=elbv2.ListenerAction.fixed_response(
|
| 1735 |
+
status_code=403,
|
| 1736 |
+
content_type="text/plain",
|
| 1737 |
+
message_body="Access denied",
|
| 1738 |
+
),
|
| 1739 |
+
)
|
| 1740 |
+
|
| 1741 |
+
# Add the Listener Rule for the specific CloudFront Host Header
|
| 1742 |
+
http_listener.add_action(
|
| 1743 |
+
"CloudFrontHostHeaderRule",
|
| 1744 |
+
action=elbv2.ListenerAction.forward(
|
| 1745 |
+
target_groups=[target_group],
|
| 1746 |
+
stickiness_duration=cookie_duration,
|
| 1747 |
+
),
|
| 1748 |
+
priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
|
| 1749 |
+
conditions=[
|
| 1750 |
+
elbv2.ListenerCondition.host_headers(
|
| 1751 |
+
[cloudfront_distribution_url]
|
| 1752 |
+
) # May have to redefine url in console afterwards if not specified in config file
|
| 1753 |
+
],
|
| 1754 |
+
)
|
| 1755 |
+
|
| 1756 |
+
else:
|
| 1757 |
+
# Add the Listener Rule for the specific CloudFront Host Header
|
| 1758 |
+
http_listener.add_action(
|
| 1759 |
+
"CloudFrontHostHeaderRule",
|
| 1760 |
+
action=elbv2.ListenerAction.forward(
|
| 1761 |
+
target_groups=[target_group],
|
| 1762 |
+
stickiness_duration=cookie_duration,
|
| 1763 |
+
),
|
| 1764 |
+
)
|
| 1765 |
+
|
| 1766 |
+
print("Added targets and actions to ALB HTTP listener.")
|
| 1767 |
+
|
| 1768 |
+
# Now the same for HTTPS if you have an ACM certificate
|
| 1769 |
+
if ACM_SSL_CERTIFICATE_ARN:
|
| 1770 |
+
listener_port_https = 443
|
| 1771 |
+
# Check if Listener exists - from_listener_arn or lookup by port/ALB
|
| 1772 |
+
|
| 1773 |
+
https_listener = add_alb_https_listener_with_cert(
|
| 1774 |
+
self,
|
| 1775 |
+
"MyHttpsListener", # Logical ID for the HTTPS listener
|
| 1776 |
+
alb,
|
| 1777 |
+
acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
|
| 1778 |
+
default_target_group=target_group,
|
| 1779 |
+
enable_cognito_auth=True,
|
| 1780 |
+
cognito_user_pool=user_pool,
|
| 1781 |
+
cognito_user_pool_client=user_pool_client,
|
| 1782 |
+
cognito_user_pool_domain=user_pool_domain,
|
| 1783 |
+
listener_open_to_internet=True,
|
| 1784 |
+
stickiness_cookie_duration=cookie_duration,
|
| 1785 |
+
)
|
| 1786 |
+
|
| 1787 |
+
if https_listener:
|
| 1788 |
+
CfnOutput(
|
| 1789 |
+
self, "HttpsListenerArn", value=https_listener.listener_arn
|
| 1790 |
+
)
|
| 1791 |
+
|
| 1792 |
+
print(f"ALB listener on port {listener_port_https} defined.")
|
| 1793 |
+
|
| 1794 |
+
# if USE_CLOUDFRONT == 'True':
|
| 1795 |
+
# # Add default action to the listener
|
| 1796 |
+
# https_listener.add_action(
|
| 1797 |
+
# "DefaultAction", # Logical ID for the default action
|
| 1798 |
+
# action=elbv2.ListenerAction.fixed_response(
|
| 1799 |
+
# status_code=403,
|
| 1800 |
+
# content_type="text/plain",
|
| 1801 |
+
# message_body="Access denied",
|
| 1802 |
+
# ),
|
| 1803 |
+
# )
|
| 1804 |
+
|
| 1805 |
+
# # Add the Listener Rule for the specific CloudFront Host Header
|
| 1806 |
+
# https_listener.add_action(
|
| 1807 |
+
# "CloudFrontHostHeaderRuleHTTPS",
|
| 1808 |
+
# action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
|
| 1809 |
+
# priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
|
| 1810 |
+
# conditions=[
|
| 1811 |
+
# elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
|
| 1812 |
+
# ]
|
| 1813 |
+
# )
|
| 1814 |
+
# else:
|
| 1815 |
+
# https_listener.add_action(
|
| 1816 |
+
# "CloudFrontHostHeaderRuleHTTPS",
|
| 1817 |
+
# action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
|
| 1818 |
+
|
| 1819 |
+
print("Added targets and actions to ALB HTTPS listener.")
|
| 1820 |
+
|
| 1821 |
+
except Exception as e:
|
| 1822 |
+
raise Exception(
|
| 1823 |
+
"Could not handle ALB target groups and listeners due to:", e
|
| 1824 |
+
)
|
| 1825 |
+
|
| 1826 |
+
# Create WAF to attach to load balancer
|
| 1827 |
+
try:
|
| 1828 |
+
web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
|
| 1829 |
+
if get_context_bool(f"exists:{web_acl_name}"):
|
| 1830 |
+
# Lookup WAF ACL by ARN from context
|
| 1831 |
+
web_acl_arn = get_context_str(f"arn:{web_acl_name}")
|
| 1832 |
+
if not web_acl_arn:
|
| 1833 |
+
raise ValueError(
|
| 1834 |
+
f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
|
| 1835 |
+
)
|
| 1836 |
+
|
| 1837 |
+
web_acl = create_web_acl_with_common_rules(
|
| 1838 |
+
self, web_acl_name, waf_scope="REGIONAL"
|
| 1839 |
+
) # Assuming it takes scope and name
|
| 1840 |
+
print(f"Handled ALB WAF web ACL {web_acl_name}.")
|
| 1841 |
+
else:
|
| 1842 |
+
web_acl = create_web_acl_with_common_rules(
|
| 1843 |
+
self, web_acl_name, waf_scope="REGIONAL"
|
| 1844 |
+
) # Assuming it takes scope and name
|
| 1845 |
+
print(f"Created ALB WAF web ACL {web_acl_name}.")
|
| 1846 |
+
|
| 1847 |
+
wafv2.CfnWebACLAssociation(
|
| 1848 |
+
self,
|
| 1849 |
+
id="alb_waf_association",
|
| 1850 |
+
resource_arn=alb.load_balancer_arn,
|
| 1851 |
+
web_acl_arn=web_acl.attr_arn,
|
| 1852 |
+
)
|
| 1853 |
+
|
| 1854 |
+
except Exception as e:
|
| 1855 |
+
raise Exception("Could not handle create ALB WAF web ACL due to:", e)
|
| 1856 |
+
|
| 1857 |
+
# --- Outputs for other stacks/regions ---
|
| 1858 |
+
|
| 1859 |
+
self.params = dict()
|
| 1860 |
+
self.params["alb_arn_output"] = alb.load_balancer_arn
|
| 1861 |
+
self.params["alb_security_group_id"] = alb_security_group.security_group_id
|
| 1862 |
+
self.params["alb_dns_name"] = alb.load_balancer_dns_name
|
| 1863 |
+
|
| 1864 |
+
CfnOutput(
|
| 1865 |
+
self,
|
| 1866 |
+
"AlbArnOutput",
|
| 1867 |
+
value=alb.load_balancer_arn,
|
| 1868 |
+
description="ARN of the Application Load Balancer",
|
| 1869 |
+
export_name=f"{self.stack_name}-AlbArn",
|
| 1870 |
+
) # Export name must be unique within the account/region
|
| 1871 |
+
|
| 1872 |
+
CfnOutput(
|
| 1873 |
+
self,
|
| 1874 |
+
"AlbSecurityGroupIdOutput",
|
| 1875 |
+
value=alb_security_group.security_group_id,
|
| 1876 |
+
description="ID of the ALB's Security Group",
|
| 1877 |
+
export_name=f"{self.stack_name}-AlbSgId",
|
| 1878 |
+
)
|
| 1879 |
+
CfnOutput(self, "ALBName", value=load_balancer_name)
|
| 1880 |
+
|
| 1881 |
+
CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
|
| 1882 |
+
|
| 1883 |
+
CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
|
| 1884 |
+
# Add other outputs if needed
|
| 1885 |
+
|
| 1886 |
+
CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
|
| 1887 |
+
|
| 1888 |
+
|
| 1889 |
+
# --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
|
| 1890 |
+
class CdkStackCloudfront(Stack):
    """Stack that places a CloudFront distribution in front of an existing ALB.

    The load balancer is imported from the ARN, security group ID and DNS name
    supplied by the caller. A WAF web ACL is defined through
    ``create_web_acl_with_common_rules`` and attached to a CloudFront
    distribution whose single origin is the imported load balancer.
    """

    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        alb_arn: str,
        alb_sec_group_id: str,
        alb_dns_name: str,
        **kwargs,
    ) -> None:
        """Define the web ACL, the ALB origin and the CloudFront distribution.

        Args:
            scope: Parent construct.
            construct_id: Logical ID of this stack.
            alb_arn: ARN of the Application Load Balancer to use as origin.
            alb_sec_group_id: ID of the load balancer's security group.
            alb_dns_name: DNS name of the load balancer.
            **kwargs: Forwarded unchanged to ``Stack.__init__``.

        Raises:
            ValueError: If ``alb_arn`` or ``alb_sec_group_id`` is empty.
            Exception: If defining the web ACL or the distribution fails; the
                original error is attached as ``__cause__``.
        """
        super().__init__(scope, construct_id, **kwargs)

        # --- Helpers to get context values ---
        def get_context_bool(key: str, default: bool = False) -> bool:
            # Falls back to `default` for any falsy/missing context value.
            return self.node.try_get_context(key) or default

        def get_context_str(key: str, default: str = None) -> str:
            # Falls back to `default` for any falsy/missing context value.
            return self.node.try_get_context(key) or default

        print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
        print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")

        if not alb_arn:
            raise ValueError("ALB ARN must be provided to CloudFront stack")
        if not alb_sec_group_id:
            raise ValueError(
                "ALB Security Group ID must be provided to CloudFront stack"
            )

        # Import the ALB using its ARN.
        # This imports an existing ALB as a construct in the CloudFront stack's context.
        # CloudFormation will understand this reference at deploy time.
        alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
            self,
            "ImportedAlb",
            load_balancer_arn=alb_arn,
            security_group_id=alb_sec_group_id,
            load_balancer_dns_name=alb_dns_name,
        )

        try:
            web_acl_name = WEB_ACL_NAME
            web_acl_in_context = get_context_bool(f"exists:{web_acl_name}")

            if web_acl_in_context:
                # When context reports the web ACL as existing, its ARN must
                # also be present in context; fail early if it is not.
                web_acl_arn = get_context_str(f"arn:{web_acl_name}")
                if not web_acl_arn:
                    raise ValueError(
                        f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
                    )

            # NOTE(review): the web ACL is defined by this stack in both cases;
            # the looked-up ARN is only validated, not used for an import.
            # Confirm this is intended.
            web_acl = create_web_acl_with_common_rules(self, web_acl_name)
            if web_acl_in_context:
                print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
            else:
                print(f"Created Cloudfront WAF web ACL {web_acl_name}.")

            # Add ALB as CloudFront Origin
            origin = origins.LoadBalancerV2Origin(
                alb,  # The imported ALB object
                custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
                origin_shield_enabled=False,
                protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
            )

            # Only restrict by geography when a restriction is configured.
            if CLOUDFRONT_GEO_RESTRICTION:
                geo_restrict = cloudfront.GeoRestriction.allowlist(
                    CLOUDFRONT_GEO_RESTRICTION
                )
            else:
                geo_restrict = None

            cloudfront_distribution = cloudfront.Distribution(
                self,
                "CloudFrontDistribution",  # Logical ID
                comment=CLOUDFRONT_DISTRIBUTION_NAME,  # Use name as comment for easier identification
                geo_restriction=geo_restrict,
                default_behavior=cloudfront.BehaviorOptions(
                    origin=origin,
                    viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
                    allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
                    cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
                    origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
                ),
                web_acl_id=web_acl.attr_arn,
            )
            print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")

        except Exception as e:
            # Chain explicitly so the original traceback is kept as the cause.
            raise Exception("Could not handle Cloudfront distribution due to:", e) from e

        # --- Outputs ---
        CfnOutput(
            self, "CloudFrontDistributionURL", value=cloudfront_distribution.domain_name
        )
|
cdk/check_resources.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
|
| 5 |
+
from cdk_config import ( # Import necessary config
|
| 6 |
+
ALB_NAME,
|
| 7 |
+
AWS_REGION,
|
| 8 |
+
CDK_CONFIG_PATH,
|
| 9 |
+
CDK_FOLDER,
|
| 10 |
+
CODEBUILD_PROJECT_NAME,
|
| 11 |
+
CODEBUILD_ROLE_NAME,
|
| 12 |
+
COGNITO_USER_POOL_CLIENT_NAME,
|
| 13 |
+
COGNITO_USER_POOL_CLIENT_SECRET_NAME,
|
| 14 |
+
COGNITO_USER_POOL_NAME,
|
| 15 |
+
CONTEXT_FILE,
|
| 16 |
+
ECR_CDK_REPO_NAME,
|
| 17 |
+
ECS_TASK_EXECUTION_ROLE_NAME,
|
| 18 |
+
ECS_TASK_ROLE_NAME,
|
| 19 |
+
PRIVATE_SUBNET_AVAILABILITY_ZONES,
|
| 20 |
+
PRIVATE_SUBNET_CIDR_BLOCKS,
|
| 21 |
+
PRIVATE_SUBNETS_TO_USE,
|
| 22 |
+
PUBLIC_SUBNET_AVAILABILITY_ZONES,
|
| 23 |
+
PUBLIC_SUBNET_CIDR_BLOCKS,
|
| 24 |
+
PUBLIC_SUBNETS_TO_USE,
|
| 25 |
+
S3_LOG_CONFIG_BUCKET_NAME,
|
| 26 |
+
S3_OUTPUT_BUCKET_NAME,
|
| 27 |
+
VPC_NAME,
|
| 28 |
+
WEB_ACL_NAME,
|
| 29 |
+
)
|
| 30 |
+
from cdk_functions import ( # Import your check functions (assuming they use Boto3)
|
| 31 |
+
_get_existing_subnets_in_vpc,
|
| 32 |
+
check_alb_exists,
|
| 33 |
+
check_codebuild_project_exists,
|
| 34 |
+
check_ecr_repo_exists,
|
| 35 |
+
check_for_existing_role,
|
| 36 |
+
check_for_existing_user_pool,
|
| 37 |
+
check_for_existing_user_pool_client,
|
| 38 |
+
check_for_secret,
|
| 39 |
+
check_s3_bucket_exists,
|
| 40 |
+
check_subnet_exists_by_name,
|
| 41 |
+
check_web_acl_exists,
|
| 42 |
+
get_vpc_id_by_name,
|
| 43 |
+
validate_subnet_creation_parameters,
|
| 44 |
+
# Add other check functions as needed
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Folder containing the CDK app, taken from cdk_config.
cdk_folder = CDK_FOLDER  # <FULL_PATH_TO_CDK_FOLDER_HERE>

# Full path needed to find config file.
# The two values are joined with plain `+`, so no path separator is inserted
# here; one of them must already supply it.
os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# --- Helper to parse environment variables into lists ---
|
| 54 |
+
def _get_env_list(env_var_name: str) -> List[str]:
|
| 55 |
+
"""Parses a comma-separated environment variable into a list of strings."""
|
| 56 |
+
value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
|
| 57 |
+
if not value:
|
| 58 |
+
return []
|
| 59 |
+
# Split by comma and filter out any empty strings that might result from extra commas
|
| 60 |
+
return [s.strip() for s in value.split(",") if s.strip()]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Normalise the subnet settings imported from cdk_config: any value that is
# truthy but not already a list is parsed into a list with _get_env_list.
# Falsy values (e.g. an empty string) are left untouched.
if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list):
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list):
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list):
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(
    PUBLIC_SUBNET_AVAILABILITY_ZONES, list
):
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list):
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(
    PRIVATE_SUBNET_AVAILABILITY_ZONES, list
):
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
|
| 79 |
+
|
| 80 |
+
# Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def check_and_set_context() -> None:
    """Check which configured AWS resources already exist and save the result.

    Runs the imported ``check_*`` / lookup helpers for the VPC and its subnets,
    IAM roles, S3 buckets, the ECR repository, the CodeBuild project, the ALB,
    the Cognito user pool and client, the Secrets Manager secret and the
    CloudFront-scope web ACL. Findings are collected in a dict under keys such
    as ``exists:<name>``, ``arn:<name>`` and ``id:<name>`` and written to
    ``CONTEXT_FILE`` as JSON.

    Raises:
        RuntimeError: If ``VPC_NAME`` is set but no matching VPC ID is returned.
        SystemExit: With code 1 if the existing subnets cannot be fetched or
            subnet parameter validation fails.
    """
    context_data = {}

    # --- Find the VPC ID first ---
    if VPC_NAME:
        print("VPC_NAME:", VPC_NAME)
        vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)

        # If you expect only one, or one per AZ and you're creating one per AZ in CDK:
        if nat_gateways:
            # For simplicity, let's just check if *any* NAT exists in the VPC
            # A more robust check would match by subnet, AZ, or a specific tag.
            context_data["exists:NatGateway"] = True
            context_data["id:NatGateway"] = nat_gateways[0][
                "NatGatewayId"
            ]  # Store the ID of the first one found
        else:
            context_data["exists:NatGateway"] = False
            context_data["id:NatGateway"] = None

        if not vpc_id:
            # If the VPC doesn't exist, you might not be able to check/create subnets.
            # Decide how to handle this: raise an error, set a flag, etc.
            raise RuntimeError(
                f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks."
            )

        context_data["vpc_id"] = vpc_id  # Store VPC ID in context

        # SUBNET CHECKS
        # Subnets (name/cidr/az) that are candidates for creation; entries are
        # removed further down once a subnet of the same name is found in AWS.
        all_proposed_subnets_data: List[Dict[str, str]] = []

        # Flag to indicate if full validation mode (with CIDR/AZs) is active
        full_validation_mode = False

        # Determine if full validation mode is possible/desired
        # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
        public_ready_for_full_validation = (
            len(PUBLIC_SUBNETS_TO_USE) > 0
            and len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE)
            and len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
        )
        private_ready_for_full_validation = (
            len(PRIVATE_SUBNETS_TO_USE) > 0
            and len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE)
            and len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
        )

        # Activate full validation if *any* type of subnet (public or private) has its full details provided.
        # You might adjust this logic if you require ALL subnet types to have CIDRs, or NONE.
        if public_ready_for_full_validation or private_ready_for_full_validation:
            full_validation_mode = True

            # If some are ready but others aren't, print a warning or raise an error based on your strictness
            if (
                public_ready_for_full_validation
                and not private_ready_for_full_validation
                and PRIVATE_SUBNETS_TO_USE
            ):
                print(
                    "Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs."
                )
            if (
                private_ready_for_full_validation
                and not public_ready_for_full_validation
                and PUBLIC_SUBNETS_TO_USE
            ):
                print(
                    "Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs."
                )

            # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
            if public_ready_for_full_validation:
                for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
                    all_proposed_subnets_data.append(
                        {
                            "name": name,
                            "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
                            "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
                        }
                    )
            if private_ready_for_full_validation:
                for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
                    all_proposed_subnets_data.append(
                        {
                            "name": name,
                            "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
                            "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
                        }
                    )

        print(f"Target VPC ID for Boto3 lookup: {vpc_id}")

        # Fetch all existing subnets in the target VPC once to avoid repeated API calls
        try:
            existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
        except Exception as e:
            print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
            raise SystemExit(1)  # Exit immediately if we can't get baseline data

        print("\n--- Running Name-Only Subnet Existence Check Mode ---")
        # Fallback: check only by name using the existing data
        # This name-only pass runs whether or not full validation mode is active.
        checked_public_subnets = {}
        if PUBLIC_SUBNETS_TO_USE:
            for subnet_name in PUBLIC_SUBNETS_TO_USE:
                print("subnet_name:", subnet_name)
                exists, subnet_id = check_subnet_exists_by_name(
                    subnet_name, existing_aws_subnets
                )
                # "az" and "route_table_id" are read from the "by_name" index
                # of the fetched subnets, and only when the subnet exists.
                checked_public_subnets[subnet_name] = {
                    "exists": exists,
                    "id": subnet_id,
                    "az": (
                        existing_aws_subnets["by_name"].get(subnet_name, {}).get("az")
                        if exists
                        else None
                    ),
                    "route_table_id": (
                        existing_aws_subnets["by_name"]
                        .get(subnet_name, {})
                        .get("route_table_id")
                        if exists
                        else None
                    ),
                }

                # If the subnet exists, remove it from the proposed subnets list
                if checked_public_subnets[subnet_name]["exists"] is True:
                    all_proposed_subnets_data = [
                        subnet
                        for subnet in all_proposed_subnets_data
                        if subnet["name"] != subnet_name
                    ]

        context_data["checked_public_subnets"] = checked_public_subnets

        # Same name-only check for the private subnets.
        checked_private_subnets = {}
        if PRIVATE_SUBNETS_TO_USE:
            for subnet_name in PRIVATE_SUBNETS_TO_USE:
                print("subnet_name:", subnet_name)
                exists, subnet_id = check_subnet_exists_by_name(
                    subnet_name, existing_aws_subnets
                )
                checked_private_subnets[subnet_name] = {
                    "exists": exists,
                    "id": subnet_id,
                    "az": (
                        existing_aws_subnets["by_name"].get(subnet_name, {}).get("az")
                        if exists
                        else None
                    ),
                    "route_table_id": (
                        existing_aws_subnets["by_name"]
                        .get(subnet_name, {})
                        .get("route_table_id")
                        if exists
                        else None
                    ),
                }

                # If the subnet exists, remove it from the proposed subnets list
                if checked_private_subnets[subnet_name]["exists"] is True:
                    all_proposed_subnets_data = [
                        subnet
                        for subnet in all_proposed_subnets_data
                        if subnet["name"] != subnet_name
                    ]

        context_data["checked_private_subnets"] = checked_private_subnets

        print("\nName-only existence subnet check complete.\n")

        if full_validation_mode:
            print(
                "\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---"
            )
            try:
                # Only the subnets not already found by name are validated.
                validate_subnet_creation_parameters(
                    vpc_id, all_proposed_subnets_data, existing_aws_subnets
                )
                print("\nPre-synth validation successful. Proceeding with CDK synth.\n")

                # Populate context_data for downstream CDK construct creation.
                # Skip subnets that already exist in AWS (imported in the stack).
                context_data["public_subnets_to_create"] = []
                if public_ready_for_full_validation:
                    for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
                        if checked_public_subnets.get(name, {}).get("exists"):
                            continue
                        context_data["public_subnets_to_create"].append(
                            {
                                "name": name,
                                "cidr": PUBLIC_SUBNET_CIDR_BLOCKS[i],
                                "az": PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
                                "is_public": True,
                            }
                        )
                context_data["private_subnets_to_create"] = []
                if private_ready_for_full_validation:
                    for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
                        if checked_private_subnets.get(name, {}).get("exists"):
                            continue
                        context_data["private_subnets_to_create"].append(
                            {
                                "name": name,
                                "cidr": PRIVATE_SUBNET_CIDR_BLOCKS[i],
                                "az": PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
                                "is_public": False,
                            }
                        )

            # ValueError is a subclass of Exception, so this catches any Exception.
            except (ValueError, Exception) as e:
                print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
                raise SystemExit(1)  # Exit if validation fails

    # Example checks and setting context values
    # IAM Roles
    # For each role: record whether it exists and, if so, its ARN.
    role_name = CODEBUILD_ROLE_NAME
    exists, role_arn, _ = check_for_existing_role(role_name)
    context_data[f"exists:{role_name}"] = exists
    if exists:
        context_data[f"arn:{role_name}"] = role_arn

    role_name = ECS_TASK_ROLE_NAME
    exists, role_arn, _ = check_for_existing_role(role_name)
    context_data[f"exists:{role_name}"] = exists
    if exists:
        context_data[f"arn:{role_name}"] = role_arn

    role_name = ECS_TASK_EXECUTION_ROLE_NAME
    exists, role_arn, _ = check_for_existing_role(role_name)
    context_data[f"exists:{role_name}"] = exists
    if exists:
        context_data[f"arn:{role_name}"] = role_arn

    # S3 Buckets
    # Only the existence flag is recorded for buckets.
    bucket_name = S3_LOG_CONFIG_BUCKET_NAME
    exists, _ = check_s3_bucket_exists(bucket_name)
    context_data[f"exists:{bucket_name}"] = exists
    if exists:
        # You might not need the ARN if using from_bucket_name
        pass

    output_bucket_name = S3_OUTPUT_BUCKET_NAME
    exists, _ = check_s3_bucket_exists(output_bucket_name)
    context_data[f"exists:{output_bucket_name}"] = exists
    if exists:
        pass

    # ECR Repository
    repo_name = ECR_CDK_REPO_NAME
    exists, _ = check_ecr_repo_exists(repo_name)
    context_data[f"exists:{repo_name}"] = exists
    if exists:
        pass  # from_repository_name is sufficient

    # CodeBuild Project
    project_name = CODEBUILD_PROJECT_NAME
    exists, project_arn, service_role_arn = check_codebuild_project_exists(project_name)
    context_data[f"exists:{project_name}"] = exists
    if exists:
        context_data[f"arn:{project_name}"] = project_arn
        if service_role_arn:
            context_data[f"service_role_arn:{project_name}"] = service_role_arn

    # ALB (by name lookup) — context keys use the same 32-char name the stack uses
    # Names longer than 32 characters are cut down to their last 32 characters.
    alb_name = ALB_NAME[-32:] if len(ALB_NAME) > 32 else ALB_NAME
    exists, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION)
    context_data[f"exists:{alb_name}"] = exists
    if exists:
        print("alb_object:", alb_object)
        context_data[f"arn:{alb_name}"] = alb_object["LoadBalancerArn"]
        context_data[f"dns:{alb_name}"] = alb_object["DNSName"]
        context_data[f"canonical_hosted_zone_id:{alb_name}"] = alb_object[
            "CanonicalHostedZoneId"
        ]
        # Only the first security group is recorded.
        if alb_object.get("SecurityGroups"):
            context_data[f"security_group_id:{alb_name}"] = alb_object[
                "SecurityGroups"
            ][0]

    # Cognito User Pool (by name)
    user_pool_name = COGNITO_USER_POOL_NAME
    exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
    context_data[f"exists:{user_pool_name}"] = exists
    if exists:
        context_data[f"id:{user_pool_name}"] = user_pool_id

    # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
    # No client keys are written to context when there is no user pool ID.
    if user_pool_id:
        user_pool_id_for_client_check = user_pool_id  # context_data.get(f"id:{user_pool_name}") # Use ID from context
        user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
        if user_pool_id_for_client_check:
            exists, client_id, _ = check_for_existing_user_pool_client(
                user_pool_client_name, user_pool_id_for_client_check
            )
            context_data[f"exists:{user_pool_client_name}"] = exists
            if exists:
                context_data[f"id:{user_pool_client_name}"] = client_id

    # Secrets Manager Secret (by name)
    secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
    exists, _ = check_for_secret(secret_name)
    context_data[f"exists:{secret_name}"] = exists
    # You might not need the ARN if using from_secret_name_v2

    # WAF Web ACL (by name and scope)
    # Only the CLOUDFRONT-scope web ACL named WEB_ACL_NAME is checked here.
    web_acl_name = WEB_ACL_NAME
    exists, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
    context_data[f"exists:{web_acl_name}"] = exists
    if exists:
        context_data[f"arn:{web_acl_name}"] = existing_web_acl["ARN"]

    # Write the context data to the file
    with open(CONTEXT_FILE, "w") as f:
        json.dump(context_data, f, indent=2)

    print(f"Context data written to {CONTEXT_FILE}")
|
cdk/lambda_load_dynamo_logs.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Lambda handler to export DynamoDB usage log table to CSV and upload to S3.
|
| 3 |
+
|
| 4 |
+
All inputs are read from environment variables (no argparse).
|
| 5 |
+
Intended to run as an AWS Lambda function; can also be invoked locally
|
| 6 |
+
by setting env vars and calling lambda_handler({}, None).
|
| 7 |
+
|
| 8 |
+
Environment variables (same semantics as load_dynamo_logs.py CLI):
|
| 9 |
+
DYNAMODB_TABLE_NAME - DynamoDB table name (default: redaction_usage)
|
| 10 |
+
AWS_REGION - AWS region (optional; if unset, uses AWS_DEFAULT_REGION,
|
| 11 |
+
then region from Lambda context ARN, then eu-west-2)
|
| 12 |
+
OUTPUT_FOLDER - Local output directory, e.g. /tmp (optional)
|
| 13 |
+
OUTPUT_FILENAME - Local output file name (default: dynamodb_logs_export.csv)
|
| 14 |
+
OUTPUT - Full local output path (overrides folder + filename if set).
|
| 15 |
+
In Lambda only /tmp is writable; relative paths are auto-resolved to /tmp.
|
| 16 |
+
FROM_DATE - Only include entries on/after this date YYYY-MM-DD (optional)
|
| 17 |
+
TO_DATE - Only include entries on/before this date YYYY-MM-DD (optional)
|
| 18 |
+
DATE_ATTRIBUTE - Attribute name for date filtering (default: timestamp)
|
| 19 |
+
S3_OUTPUT_BUCKET - S3 bucket for the output CSV (required for upload)
|
| 20 |
+
S3_OUTPUT_KEY - S3 object key/path for the output CSV (required for upload)
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import csv
|
| 24 |
+
import datetime
|
| 25 |
+
import os
|
| 26 |
+
from decimal import Decimal
|
| 27 |
+
from io import StringIO
|
| 28 |
+
|
| 29 |
+
import boto3
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _get_region_from_context(context):
|
| 33 |
+
"""Extract region from Lambda context invoked_function_arn (arn:aws:lambda:REGION:ACCOUNT:function:NAME)."""
|
| 34 |
+
if context is None:
|
| 35 |
+
return None
|
| 36 |
+
arn = getattr(context, "invoked_function_arn", None)
|
| 37 |
+
if not arn or not isinstance(arn, str):
|
| 38 |
+
return None
|
| 39 |
+
parts = arn.split(":")
|
| 40 |
+
if len(parts) >= 4:
|
| 41 |
+
return parts[3] # region is 4th segment
|
| 42 |
+
return None
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_config_from_env(context=None):
    """Build the export configuration from environment variables.

    Args:
        context: Optional Lambda context object. It is only consulted (via
            ``_get_region_from_context``) when neither ``AWS_REGION`` nor
            ``AWS_DEFAULT_REGION`` is set.

    Returns:
        dict with the keys ``table_name``, ``region``, ``local_output_path``,
        ``from_date``, ``to_date``, ``date_attribute``, ``s3_output_bucket``
        and ``s3_output_key``. ``from_date`` / ``to_date`` are ``date``
        objects; a missing ``FROM_DATE`` defaults to 365 days before today and
        a missing ``TO_DATE`` defaults to today.

    Raises:
        ValueError: if ``FROM_DATE`` or ``TO_DATE`` is set but is not in
            ``YYYY-MM-DD`` format.
    """
    env = os.environ
    today = datetime.datetime.now().date()
    one_year_ago = today - datetime.timedelta(days=365)

    table_name = env.get("DYNAMODB_TABLE_NAME") or env.get(
        "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
    )
    region = (env.get("AWS_REGION") or env.get("AWS_DEFAULT_REGION") or "").strip()
    output = env.get("OUTPUT")
    output_folder = env.get("OUTPUT_FOLDER", "output/")
    output_filename = env.get("OUTPUT_FILENAME", "dynamodb_logs_export.csv")
    from_date_str = env.get("FROM_DATE")
    to_date_str = env.get("TO_DATE")
    date_attribute = env.get("DATE_ATTRIBUTE", "timestamp")
    s3_output_bucket = env.get("S3_OUTPUT_BUCKET")
    s3_output_key = env.get("S3_OUTPUT_KEY")

    # OUTPUT, when set, overrides the folder + filename pair.
    if output:
        local_output_path = output
    else:
        folder = output_folder.rstrip("/").rstrip("\\")
        local_output_path = os.path.join(folder, output_filename)

    # In AWS Lambda only /tmp is writable; anything resolving outside it is
    # redirected to /tmp/<basename>. The check is on a path-component
    # boundary so that e.g. "/tmpfoo/x.csv" is not mistaken for a /tmp path.
    if env.get("AWS_LAMBDA_FUNCTION_NAME"):
        resolved = os.path.abspath(local_output_path)
        if resolved != "/tmp" and not resolved.startswith("/tmp/"):
            local_output_path = os.path.join(
                "/tmp", os.path.basename(local_output_path)
            )

    # Region: env (AWS_REGION / AWS_DEFAULT_REGION) -> Lambda context ARN ->
    # the documented eu-west-2 fallback.
    if not region and context is not None:
        region = _get_region_from_context(context) or ""
    if not region:
        region = "eu-west-2"

    from_date = (
        datetime.datetime.strptime(from_date_str, "%Y-%m-%d").date()
        if from_date_str
        else one_year_ago
    )
    to_date = (
        datetime.datetime.strptime(to_date_str, "%Y-%m-%d").date()
        if to_date_str
        else today
    )

    return {
        "table_name": table_name,
        "region": region,
        "local_output_path": local_output_path,
        "from_date": from_date,
        "to_date": to_date,
        "date_attribute": date_attribute,
        "s3_output_bucket": s3_output_bucket,
        "s3_output_key": s3_output_key,
    }
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Helper function to convert Decimal to float or int
|
| 114 |
+
def convert_types(item):
    """Return a copy of *item* with values normalised for CSV output.

    * ``Decimal`` values become ``int`` when they are whole numbers,
      otherwise ``float``.
    * Strings that ``datetime.fromisoformat`` accepts (after a trailing
      ``Z`` is rewritten to ``+00:00``) are reformatted as
      ``YYYY-MM-DD HH:MM:SS.mmm``; other strings are kept unchanged.
    * Every other value is passed through untouched.
    """

    def _normalise(value):
        if isinstance(value, Decimal):
            return int(value) if value % 1 == 0 else float(value)
        if isinstance(value, str):
            try:
                parsed = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
                # %f yields microseconds; dropping 3 digits leaves milliseconds.
                return parsed.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
            except (ValueError, TypeError):
                return value
        return value

    return {key: _normalise(value) for key, value in item.items()}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _parse_item_date(value):
|
| 131 |
+
"""Parse a DynamoDB attribute value to datetime for comparison. Returns None if unparseable."""
|
| 132 |
+
if value is None:
|
| 133 |
+
return None
|
| 134 |
+
if isinstance(value, Decimal):
|
| 135 |
+
try:
|
| 136 |
+
return datetime.datetime.utcfromtimestamp(float(value))
|
| 137 |
+
except (ValueError, OSError):
|
| 138 |
+
return None
|
| 139 |
+
if isinstance(value, (int, float)):
|
| 140 |
+
try:
|
| 141 |
+
return datetime.datetime.utcfromtimestamp(float(value))
|
| 142 |
+
except (ValueError, OSError):
|
| 143 |
+
return None
|
| 144 |
+
if isinstance(value, str):
|
| 145 |
+
for fmt in (
|
| 146 |
+
"%Y-%m-%d %H:%M:%S.%f",
|
| 147 |
+
"%Y-%m-%d %H:%M:%S",
|
| 148 |
+
"%Y-%m-%d",
|
| 149 |
+
"%Y-%m-%dT%H:%M:%S",
|
| 150 |
+
):
|
| 151 |
+
try:
|
| 152 |
+
return datetime.datetime.strptime(value, fmt)
|
| 153 |
+
except (ValueError, TypeError):
|
| 154 |
+
continue
|
| 155 |
+
try:
|
| 156 |
+
return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
|
| 157 |
+
except (ValueError, TypeError):
|
| 158 |
+
pass
|
| 159 |
+
return None
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def filter_items_by_date(items, from_date, to_date, date_attribute: str):
    """Return items whose date attribute falls within [from_date, to_date].

    Both bounds are inclusive and cover the whole day. Either bound may be
    ``None`` to leave that side open; when both are ``None`` the input is
    returned unchanged.

    Args:
        items: Iterable of dict-like items.
        from_date: Earliest ``date`` to keep, or ``None``.
        to_date: Latest ``date`` to keep, or ``None``.
        date_attribute: Key holding each item's date value.

    Returns:
        A new list of matching items. Items whose date attribute is missing
        or cannot be parsed by ``_parse_item_date`` are dropped.
    """
    if from_date is None and to_date is None:
        return items
    start = (
        datetime.datetime.combine(from_date, datetime.time.min)
        if from_date is not None
        else None
    )
    end = (
        datetime.datetime.combine(to_date, datetime.time.max)
        if to_date is not None
        else None
    )
    filtered = []
    for item in items:
        dt = _parse_item_date(item.get(date_attribute))
        if dt is None:
            continue
        if dt.tzinfo:
            # Bounds are naive, so compare on the wall-clock value.
            dt = dt.replace(tzinfo=None)
        if start is not None and dt < start:
            continue
        if end is not None and dt > end:
            continue
        filtered.append(item)
    return filtered
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def scan_table(table):
    """Scan *table* page by page and return every item as one list.

    ``table.scan()`` is called repeatedly; while a response contains
    ``LastEvaluatedKey`` that key is passed back as ``ExclusiveStartKey`` to
    fetch the next page.
    """
    collected = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        collected += page["Items"]
        if "LastEvaluatedKey" not in page:
            return collected
        scan_kwargs = {"ExclusiveStartKey": page["LastEvaluatedKey"]}
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def export_to_csv_buffer(items, fields_to_drop=None):
    """Serialise *items* to CSV text held in memory.

    The header is the sorted union of all item keys minus *fields_to_drop*.
    Each row is passed through ``convert_types`` before being written, and
    keys missing from a row are written as empty strings.

    Returns:
        ``(csv_string, fieldnames)``; ``("", [])`` when *items* is empty.
    """
    if not items:
        return "", []

    excluded = set(fields_to_drop or [])
    fieldnames = sorted({key for row in items for key in row.keys()} - excluded)

    stream = StringIO()
    writer = csv.DictWriter(
        stream, fieldnames=fieldnames, extrasaction="ignore", restval=""
    )
    writer.writeheader()
    writer.writerows(convert_types(row) for row in items)
    return stream.getvalue(), fieldnames
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def export_to_csv_file(items, output_path, fields_to_drop=None):
    """Write *items* as a CSV file at *output_path*.

    The CSV text comes from ``export_to_csv_buffer``. Nothing is written when
    that text is empty. The parent directory is created if needed, and the
    file is encoded as UTF-8 with a byte-order mark (``utf-8-sig``).
    """
    payload, _fieldnames = export_to_csv_buffer(items, fields_to_drop)
    if payload:
        parent_dir = os.path.dirname(os.path.abspath(output_path)) or "."
        os.makedirs(parent_dir, exist_ok=True)
        # newline="" stops the text layer from translating the csv module's
        # own line terminators.
        with open(output_path, "w", newline="", encoding="utf-8-sig") as handle:
            handle.write(payload)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def run_export(config):
    """Run the full export: scan DynamoDB, filter by date, write CSV, upload.

    Args:
        config: dict as produced by ``get_config_from_env`` (keys
            ``table_name``, ``region``, ``local_output_path``, ``from_date``,
            ``to_date``, ``date_attribute``, ``s3_output_bucket``,
            ``s3_output_key``).

    Returns:
        dict with ``item_count``, ``from_date``, ``to_date`` and ``columns``.
        When there is data to write it also has ``local_path`` (or
        ``local_write_error`` if the local write failed) and ``s3_uri`` (or
        ``s3_skip_reason`` when only one of bucket/key is configured).

    Raises:
        ValueError: if ``from_date`` is later than ``to_date``.
    """
    table_name = config["table_name"]
    region = config["region"]
    local_output_path = config["local_output_path"]
    from_date = config["from_date"]
    to_date = config["to_date"]
    date_attribute = config["date_attribute"]
    s3_output_bucket = config["s3_output_bucket"]
    s3_output_key = config["s3_output_key"]

    if from_date > to_date:
        raise ValueError("FROM_DATE must be on or before TO_DATE")

    dynamodb = boto3.resource("dynamodb", region_name=region or None)
    table = dynamodb.Table(table_name)

    items = scan_table(table)
    items = filter_items_by_date(items, from_date, to_date, date_attribute)

    csv_string, fieldnames = export_to_csv_buffer(items, fields_to_drop=[])
    result = {
        "item_count": len(items),
        "from_date": str(from_date),
        "to_date": str(to_date),
        "columns": fieldnames,
    }

    if csv_string:
        # Optional local copy (e.g. /tmp in Lambda). The CSV text built above
        # is written directly rather than re-serialising every item.
        try:
            os.makedirs(
                os.path.dirname(os.path.abspath(local_output_path)) or ".",
                exist_ok=True,
            )
            with open(
                local_output_path, "w", newline="", encoding="utf-8-sig"
            ) as f:
                f.write(csv_string)
            result["local_path"] = local_output_path
        except Exception as e:
            # Deliberately broad: the local copy is best-effort and must not
            # prevent the S3 upload; the failure is reported in the result.
            result["local_write_error"] = str(e)

        # Upload to S3 only if both bucket and key are set.
        if s3_output_bucket and s3_output_key:
            s3 = boto3.client("s3", region_name=region or None)
            s3.put_object(
                Bucket=s3_output_bucket,
                Key=s3_output_key,
                Body=csv_string.encode("utf-8-sig"),
                ContentType="text/csv; charset=utf-8",
            )
            result["s3_uri"] = f"s3://{s3_output_bucket}/{s3_output_key}"
        elif s3_output_bucket or s3_output_key:
            result["s3_skip_reason"] = (
                "Both S3_OUTPUT_BUCKET and S3_OUTPUT_KEY must be set"
            )

    return result
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def lambda_handler(event, context):
    """AWS Lambda entry point.

    Configuration is read from environment variables via
    ``get_config_from_env``. When *event* is a dict, any truthy value under
    ``table_name``, ``region``, ``from_date``, ``to_date``,
    ``date_attribute``, ``s3_output_bucket`` or ``s3_output_key`` overrides
    the matching config entry. The two date overrides must be ``YYYY-MM-DD``
    strings.

    Returns:
        ``{"statusCode": 200, "body": <result of run_export>}``.
    """
    config = get_config_from_env(context=context)

    if isinstance(event, dict):
        # Overrides that are copied across verbatim.
        for key in ("table_name", "region"):
            supplied = event.get(key)
            if supplied:
                config[key] = supplied
        # Date overrides are parsed into date objects.
        for key in ("from_date", "to_date"):
            supplied = event.get(key)
            if supplied:
                config[key] = datetime.datetime.strptime(supplied, "%Y-%m-%d").date()
        for key in ("date_attribute", "s3_output_bucket", "s3_output_key"):
            supplied = event.get(key)
            if supplied:
                config[key] = supplied

    return {"statusCode": 200, "body": run_export(config)}
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
if __name__ == "__main__":
    # Local run: configuration comes from environment variables; an empty
    # event and no Lambda context are passed to the handler.
    import json

    print(json.dumps(lambda_handler({}, None), indent=2))
|
cdk/post_cdk_build_quickstart.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
from cdk_config import (
|
| 4 |
+
CLUSTER_NAME,
|
| 5 |
+
CODEBUILD_PROJECT_NAME,
|
| 6 |
+
ECS_SERVICE_NAME,
|
| 7 |
+
S3_LOG_CONFIG_BUCKET_NAME,
|
| 8 |
+
)
|
| 9 |
+
from cdk_functions import (
|
| 10 |
+
create_basic_config_env,
|
| 11 |
+
start_codebuild_build,
|
| 12 |
+
start_ecs_task,
|
| 13 |
+
upload_file_to_s3,
|
| 14 |
+
)
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
|
| 17 |
+
# Write a starter config.env that the user can run the app with later.
# The argument is the folder the file is saved into.
create_basic_config_env("config")

# Kick off the CodeBuild build.
print("Starting CodeBuild project.")
start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)

# Push the generated config.env to the S3 log/config bucket.
upload_file_to_s3(
    local_file_paths="config/config.env",
    s3_key="",
    s3_bucket=S3_LOG_CONFIG_BUCKET_NAME,
)

total_seconds = 660  # 11 minutes
update_interval = 1  # Progress bar ticks once per second

print("Waiting 11 minutes for the CodeBuild container to build.")

# Fixed-length wait; tqdm only provides the progress bar.
for _ in tqdm(range(total_seconds), desc="Building container"):
    time.sleep(update_interval)

# Start the task on ECS.
print("Starting ECS task")
start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
|
cdk/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aws-cdk-lib==2.257.0
|
| 2 |
+
boto3<=1.42.91
|
| 3 |
+
pandas<=2.3.3
|
| 4 |
+
nodejs<=0.1.1
|
| 5 |
+
python-dotenv<=1.2.2
|
cli_redact.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_redaction/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
doc_redaction package.
|
| 3 |
+
|
| 4 |
+
This package layer is intentionally thin for now: it preserves existing
|
| 5 |
+
repo-root entrypoints (e.g. `app.py`, `cli_redact.py`) while providing stable
|
| 6 |
+
import paths for PyPI installs.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
__all__ = ["__version__", "choose_and_run_redactor", "run_redaction"]
|
| 12 |
+
|
| 13 |
+
# Resolve the installed distribution version. Any failure (importlib.metadata
# unavailable, package not installed, or another lookup error) leaves the
# "0.0.0" placeholder in place.
__version__ = "0.0.0"
try:
    from importlib.metadata import PackageNotFoundError, version

    __version__ = version("doc_redaction")
except Exception:  # pragma: no cover
    pass
|
| 22 |
+
|
| 23 |
+
# Convenience re-exports (package-qualified import surface)
|
| 24 |
+
from doc_redaction.file_redaction import (
|
| 25 |
+
choose_and_run_redactor,
|
| 26 |
+
run_redaction,
|
| 27 |
+
) # noqa: E402
|
doc_redaction/api.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stable programmatic API surface matching Gradio `api_name` values.
|
| 3 |
+
|
| 4 |
+
This module provides names that exactly match the Gradio endpoint `api_name`
|
| 5 |
+
strings from `app.py`.
|
| 6 |
+
|
| 7 |
+
By default these names point to the **CLI-first** Python API (`doc_redaction.cli_api`),
|
| 8 |
+
which is the most stable and runnable interface outside Gradio session state.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from doc_redaction.cli_api import (
|
| 14 |
+
apply_review_redactions,
|
| 15 |
+
combine_review_csvs,
|
| 16 |
+
combine_review_pdfs,
|
| 17 |
+
export_review_page_ocr_visualisation,
|
| 18 |
+
export_review_redaction_overlay,
|
| 19 |
+
find_duplicate_pages,
|
| 20 |
+
find_duplicate_tabular,
|
| 21 |
+
load_and_prepare_documents_or_data,
|
| 22 |
+
redact_data,
|
| 23 |
+
redact_document,
|
| 24 |
+
summarise_document,
|
| 25 |
+
verify_redaction_coverage,
|
| 26 |
+
word_level_ocr_text_search,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
__all__ = [
|
| 30 |
+
"redact_document",
|
| 31 |
+
"load_and_prepare_documents_or_data",
|
| 32 |
+
"apply_review_redactions",
|
| 33 |
+
"export_review_page_ocr_visualisation",
|
| 34 |
+
"export_review_redaction_overlay",
|
| 35 |
+
"verify_redaction_coverage",
|
| 36 |
+
"word_level_ocr_text_search",
|
| 37 |
+
"redact_data",
|
| 38 |
+
"find_duplicate_pages",
|
| 39 |
+
"find_duplicate_tabular",
|
| 40 |
+
"summarise_document",
|
| 41 |
+
"combine_review_csvs",
|
| 42 |
+
"combine_review_pdfs",
|
| 43 |
+
]
|
doc_redaction/assets/favicon.png
ADDED
|
|
Git LFS Details
|
doc_redaction/cli_api.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CLI-first programmatic API surface.
|
| 3 |
+
|
| 4 |
+
These functions provide a minimal, runnable Python interface that mirrors the
|
| 5 |
+
Gradio `api_name` routes, but executes the underlying workflows via the CLI
|
| 6 |
+
engine (`cli_redact.main(direct_mode_args=...)`).
|
| 7 |
+
|
| 8 |
+
Return values are lists of output file paths created in `output_dir`.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import tempfile
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Any, Iterable
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _ensure_list(v: str | list[str] | tuple[str, ...]) -> list[str]:
|
| 20 |
+
if isinstance(v, (list, tuple)):
|
| 21 |
+
return [str(x) for x in v]
|
| 22 |
+
return [str(v)]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _snapshot_files(folder: str) -> set[str]:
|
| 26 |
+
root = Path(folder)
|
| 27 |
+
if not root.exists():
|
| 28 |
+
return set()
|
| 29 |
+
out: set[str] = set()
|
| 30 |
+
for dirpath, _, filenames in os.walk(root):
|
| 31 |
+
for name in filenames:
|
| 32 |
+
out.add(str(Path(dirpath) / name))
|
| 33 |
+
return out
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _default_output_dir(prefix: str) -> str:
|
| 37 |
+
return tempfile.mkdtemp(prefix=f"doc_redaction_{prefix}_")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _run_cli(
    *,
    gradio_api_name: str,
    overrides: dict[str, Any],
    output_dir: str | None,
) -> list[str]:
    """Run ``cli_redact.main`` in direct mode and report the files it created.

    The CLI defaults are updated with *overrides* and ``output_dir`` is then
    set last. When *output_dir* is ``None`` a temporary directory named after
    *gradio_api_name* is created.

    Returns:
        Sorted paths of files found under the output directory after the run
        that were not there before it.
    """
    from cli_redact import get_cli_default_args_dict
    from cli_redact import main as cli_main

    cli_args = get_cli_default_args_dict()
    cli_args.update(overrides)

    target_dir = str(
        _default_output_dir(gradio_api_name) if output_dir is None else output_dir
    )
    cli_args["output_dir"] = target_dir

    # Diff the directory contents around the call to find the new outputs.
    existing = _snapshot_files(target_dir)
    cli_main(direct_mode_args=cli_args)
    return sorted(_snapshot_files(target_dir) - existing)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
# Implemented via CLI engine (matches agent_routes.py)
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def redact_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    ocr_method: str | None = None,
    pii_detector: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``redact`` task; counterpart of Gradio ``api_name='redact_document'``.

    ``ocr_method`` and ``pii_detector`` are forwarded under the same names and
    ``instruction`` as ``custom_llm_instructions``; each is only sent when not
    ``None``. *overrides* is applied last and can replace any of these.

    Returns:
        Paths of the files created in the output directory.
    """
    cli_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    optional = {
        "ocr_method": ocr_method,
        "pii_detector": pii_detector,
        "custom_llm_instructions": instruction,
    }
    cli_args.update(
        {key: value for key, value in optional.items() if value is not None}
    )
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="redact_document", overrides=cli_args, output_dir=output_dir
    )
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def redact_data(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    instruction: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``redact`` task; counterpart of Gradio ``api_name='redact_data'``.

    ``instruction`` is forwarded as ``custom_llm_instructions`` when not
    ``None``; *overrides* is applied last.

    Returns:
        Paths of the files created in the output directory.
    """
    cli_args: dict[str, Any] = {
        "task": "redact",
        "input_file": _ensure_list(input_files),
    }
    if instruction is not None:
        cli_args["custom_llm_instructions"] = instruction
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="redact_data", overrides=cli_args, output_dir=output_dir
    )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def find_duplicate_pages(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    min_consecutive_pages: int | None = None,
    greedy_match: bool | None = None,
    combine_pages: bool | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``deduplicate`` task with ``duplicate_type='pages'``.

    Counterpart of Gradio ``api_name='find_duplicate_pages'``. Options left as
    ``None`` are not sent, so the CLI defaults apply. The two boolean flags
    are passed to the CLI as the strings ``"True"`` / ``"False"``.
    *overrides* is applied last.

    Returns:
        Paths of the files created in the output directory.
    """

    def _flag(value):
        # The CLI receives these flags as text, not as bool.
        if value is None:
            return None
        return "True" if value else "False"

    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "pages",
        "input_file": _ensure_list(input_files),
    }
    optional = {
        "similarity_threshold": similarity_threshold,
        "min_word_count": min_word_count,
        "min_consecutive_pages": min_consecutive_pages,
        "greedy_match": _flag(greedy_match),
        "combine_pages": _flag(combine_pages),
    }
    cli_args.update(
        {key: value for key, value in optional.items() if value is not None}
    )
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="find_duplicate_pages", overrides=cli_args, output_dir=output_dir
    )
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def find_duplicate_tabular(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    text_columns: list[str] | None = None,
    similarity_threshold: float | None = None,
    min_word_count: int | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``deduplicate`` task with ``duplicate_type='tabular'``.

    Counterpart of Gradio ``api_name='find_duplicate_tabular'``. Options left
    as ``None`` are not sent; ``text_columns`` is copied into a new list.
    *overrides* is applied last.

    Returns:
        Paths of the files created in the output directory.
    """
    cli_args: dict[str, Any] = {
        "task": "deduplicate",
        "duplicate_type": "tabular",
        "input_file": _ensure_list(input_files),
    }
    optional = {
        "text_columns": None if text_columns is None else list(text_columns),
        "similarity_threshold": similarity_threshold,
        "min_word_count": min_word_count,
    }
    cli_args.update(
        {key: value for key, value in optional.items() if value is not None}
    )
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="find_duplicate_tabular",
        overrides=cli_args,
        output_dir=output_dir,
    )
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def summarise_document(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``summarise`` task; counterpart of Gradio ``api_name='summarise_document'``.

    *overrides*, when given, is merged over the task arguments.

    Returns:
        Paths of the files created in the output directory.
    """
    cli_args: dict[str, Any] = {"task": "summarise"}
    cli_args["input_file"] = _ensure_list(input_files)
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="summarise_document", overrides=cli_args, output_dir=output_dir
    )
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def combine_review_pdfs(
    input_files: str | list[str],
    *,
    output_dir: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> list[str]:
    """Run the CLI ``combine_review_pdfs`` task; counterpart of the Gradio route of the same name.

    *overrides*, when given, is merged over the task arguments.

    Returns:
        Paths of the files created in the output directory.
    """
    cli_args: dict[str, Any] = {"task": "combine_review_pdfs"}
    cli_args["input_file"] = _ensure_list(input_files)
    if overrides:
        cli_args.update(overrides)
    return _run_cli(
        gradio_api_name="combine_review_pdfs", overrides=cli_args, output_dir=output_dir
    )
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ---------------------------------------------------------------------------
|
| 221 |
+
# Implemented without CLI (as per agent_routes.py)
|
| 222 |
+
# ---------------------------------------------------------------------------
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def combine_review_csvs(
    input_files: Iterable[str],
    *,
    output_dir: str | None = None,
) -> list[str]:
    """Merge review CSV files through ``tools.helper_functions.merge_csv_files``.

    Parity with Gradio ``api_name='combine_review_csvs'``. Implemented
    directly rather than through a CLI task.

    Args:
        input_files: Paths of the CSV files to merge. A single path passed
            as a bare ``str`` or ``Path`` is treated as a one-item list, in
            line with the ``str | list[str]`` inputs of the sibling wrappers.
        output_dir: Destination folder. When falsy,
            ``tools.config.OUTPUT_FOLDER`` is used. The folder is created if
            it does not exist.

    Returns:
        The value returned by ``merge_csv_files`` (annotated as a list of
        str).
    """
    from tools.config import OUTPUT_FOLDER
    from tools.helper_functions import merge_csv_files

    # A ``str`` is itself an ``Iterable[str]``: without this guard a single
    # path would be split into one bogus "file" per character.
    if isinstance(input_files, (str, Path)):
        input_files = [input_files]

    out_dir = str(output_dir or OUTPUT_FOLDER)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # ``merge_csv_files`` is handed the folder with a trailing separator
    # (presumably it concatenates folder and file name -- confirm in helper).
    sep = "" if out_dir.endswith(("/", "\\")) else "/"

    return merge_csv_files([str(p) for p in input_files], output_folder=out_dir + sep)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def export_review_redaction_overlay(
    *,
    page_image_path: str,
    boxes: list[dict[str, Any]],
    page_number: int = 1,
    doc_base_name: str = "review",
    review_df_records: list[dict[str, Any]] | None = None,
    label_abbrev_chars: int | None = None,
) -> list[str]:
    """Render redaction boxes over a page image and return the image path.

    Same behaviour as Gradio ``api_name='page_redaction_review_image'``;
    exposed on the Agent API as ``export_review_redaction_overlay``.

    Args:
        page_image_path: Path of the page image, passed on as the
            ``"image"`` entry of the annotator dict.
        boxes: Box dicts, passed on as the ``"boxes"`` entry.
        page_number: Forwarded to ``visualise_review_redaction_boxes``.
        doc_base_name: Forwarded to ``visualise_review_redaction_boxes``.
        review_df_records: Optional row dicts turned into a DataFrame; an
            empty DataFrame is used when this is falsy.
        label_abbrev_chars: Forwarded to ``visualise_review_redaction_boxes``.

    Returns:
        A one-item list holding the path returned by the helper, or an empty
        list when the helper returns a falsy value.
    """
    import pandas as pd

    from tools.config import OUTPUT_FOLDER
    from tools.redaction_review import visualise_review_redaction_boxes

    if review_df_records:
        review_frame = pd.DataFrame(review_df_records)
    else:
        review_frame = pd.DataFrame()

    # Output always goes under the configured OUTPUT_FOLDER (absolute form).
    output_root = str(Path(OUTPUT_FOLDER).expanduser().resolve())
    Path(output_root).mkdir(parents=True, exist_ok=True)

    overlay_path = visualise_review_redaction_boxes(
        {"image": page_image_path, "boxes": boxes},
        review_df=review_frame,
        output_folder=output_root,
        page_number=page_number,
        doc_base_name=doc_base_name,
        label_abbrev_chars=label_abbrev_chars,
    )
    if not overlay_path:
        return []
    return [overlay_path]
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def export_review_page_ocr_visualisation(
    *,
    page_image_path: str,
    ocr_results: dict[str, Any],
    page_number: int = 1,
    doc_base_name: str = "review",
) -> list[str]:
    """Draw OCR word boxes on a page image and return the written paths.

    Same behaviour as Gradio ``api_name='page_ocr_review_image'``; exposed on
    the Agent API as ``export_review_page_ocr_visualisation``.

    Args:
        page_image_path: Path of the page image to open.
        ocr_results: OCR results passed straight to
            ``visualise_ocr_words_bounding_boxes``.
        page_number: Used (as ``int``) in the generated image name.
        doc_base_name: Prefix of the generated image name; ``'review'`` is
            used when this is falsy.

    Returns:
        A list built from the value returned by
        ``visualise_ocr_words_bounding_boxes``.
    """
    from PIL import Image

    from tools.config import OUTPUT_FOLDER
    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    # Output always goes under the configured OUTPUT_FOLDER (absolute form).
    out_dir = str(Path(OUTPUT_FOLDER).expanduser().resolve())
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    image_name = f"{str(doc_base_name or 'review')}_page{int(page_number)}.png"

    # ``Image.open`` keeps the file handle open; ``convert`` returns an
    # independent in-memory copy, so the source file can be closed right away
    # instead of lingering until garbage collection.
    with Image.open(page_image_path) as source_image:
        page_image = source_image.convert("RGB")

    written_paths = visualise_ocr_words_bounding_boxes(
        page_image,
        ocr_results,
        image_name=image_name,
        output_folder=out_dir,
        visualisation_folder="review_ocr_visualisations",
        add_legend=True,
        log_files_output_paths=[],
    )
    return list(written_paths)
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
# ---------------------------------------------------------------------------
|
| 303 |
+
# Gradio-session-only (no single CLI task)
|
| 304 |
+
# ---------------------------------------------------------------------------
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def load_and_prepare_documents_or_data(*args: Any, **kwargs: Any) -> list[str]:
    """Placeholder for the Gradio-session-only document preparation step.

    Accepts any arguments, ignores them, and never returns.

    Raises:
        NotImplementedError: Always.
    """
    reason = (
        "load_and_prepare_documents_or_data is Gradio-session-state driven "
        "and is not exposed as a single CLI task."
    )
    raise NotImplementedError(reason)
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def apply_review_redactions(
    pdf_path: str,
    review_csv_path: str,
    *,
    output_dir: str | None = None,
    input_dir: str | None = None,
    text_extract_method: str | None = None,
    efficient_ocr: bool | None = None,
) -> list[str]:
    """Apply a review CSV to a PDF without a Gradio session.

    Headless parity with Gradio ``api_name='apply_review_redactions'``. All
    arguments are forwarded by keyword to
    ``tools.simplified_api.run_apply_review_redactions``.

    Returns:
        A list copy of the ``"output_paths"`` entry of the helper's result
        (redacted PDF, review CSV, logs, etc.), or an empty list when that
        entry is missing or falsy.
    """
    from tools.simplified_api import run_apply_review_redactions

    forwarded = {
        "pdf_path": pdf_path,
        "review_csv_path": review_csv_path,
        "output_dir": output_dir,
        "input_dir": input_dir,
        "text_extract_method": text_extract_method,
        "efficient_ocr": efficient_ocr,
    }
    result = run_apply_review_redactions(**forwarded)

    produced = result.get("output_paths")
    if not produced:
        return []
    return list(produced)
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def word_level_ocr_text_search(
    ocr_words_csv_path: str,
    search_text: str,
    *,
    similarity_threshold: float = 1.0,
    use_regex: bool = False,
    review_csv_path: str | None = None,
) -> dict:
    """Search word-level OCR output (``*_ocr_results_with_words_*.csv``) headlessly.

    Thin pass-through to
    ``tools.verify_redaction_coverage.run_word_level_ocr_text_search``: the
    two paths/text are passed positionally, the remaining options by keyword.

    Returns:
        The helper's return value, unchanged (annotated as ``dict``).
    """
    from tools.verify_redaction_coverage import run_word_level_ocr_text_search

    search_options = {
        "similarity_threshold": similarity_threshold,
        "use_regex": use_regex,
        "review_csv_path": review_csv_path,
    }
    return run_word_level_ocr_text_search(
        ocr_words_csv_path,
        search_text,
        **search_options,
    )
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def verify_redaction_coverage(
    review_csv_path: str,
    ocr_words_csv_path: str,
    *,
    must_redact: list[str] | None = None,
    must_not_redact: list[str] | None = None,
    redacted_pdf_path: str | None = None,
    total_pages: int | None = None,
    min_word_length: int = 3,
    sample_pixels: bool = False,
    auto_prune_suspicious: bool = False,
    pruned_output_path: str | None = None,
) -> dict:
    """Produce the pass 1 programmatic coverage report (no VLM).

    Delegates to ``tools.simplified_api.run_verify_redaction_coverage``,
    passing the two CSV paths positionally and every option by keyword.

    Returns:
        The first element of the 3-item result returned by the helper; the
        other two elements are discarded.
    """
    from tools.simplified_api import run_verify_redaction_coverage

    coverage_options = {
        "must_redact": must_redact,
        "must_not_redact": must_not_redact,
        "redacted_pdf_path": redacted_pdf_path,
        "total_pages": total_pages,
        "min_word_length": min_word_length,
        "sample_pixels": sample_pixels,
        "auto_prune_suspicious": auto_prune_suspicious,
        "pruned_output_path": pruned_output_path,
    }
    outcome = run_verify_redaction_coverage(
        review_csv_path,
        ocr_words_csv_path,
        **coverage_options,
    )
    # Strict 3-way unpack: only the report is part of this wrapper's contract.
    coverage_report, _second, _third = outcome
    return coverage_report
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
# Public API of this module: one entry per wrapper function exported here.
__all__ = [
    # Document redaction and review
    "redact_document",
    "load_and_prepare_documents_or_data",
    "apply_review_redactions",
    "export_review_page_ocr_visualisation",
    "export_review_redaction_overlay",
    "word_level_ocr_text_search",
    "verify_redaction_coverage",
    # Tabular data, duplicate detection, summarisation
    "redact_data",
    "find_duplicate_pages",
    "find_duplicate_tabular",
    "summarise_document",
    # Combining review outputs
    "combine_review_csvs",
    "combine_review_pdfs",
]
|
doc_redaction/cli_redact.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CLI entrypoint for packaging.
|
| 3 |
+
|
| 4 |
+
Re-exports the existing repo-root `cli_redact.py` implementation so that
|
| 5 |
+
`pyproject.toml` console scripts can target a stable package path.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import importlib
|
| 11 |
+
from typing import Any, Dict
|
| 12 |
+
|
| 13 |
+
# Load the repo-root ``cli_redact`` module by name and re-export its helpers
# under this package path.
_root_cli = importlib.import_module("cli_redact")

build_cli_argument_parser = _root_cli.build_cli_argument_parser
get_cli_default_args_dict = _root_cli.get_cli_default_args_dict
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def main(direct_mode_args: Dict[str, Any] | None = None):
    """Delegate to the repo-root ``cli_redact.main``.

    Args:
        direct_mode_args: Arguments forwarded as ``direct_mode_args``. When
            ``None``, a fresh empty dict is passed instead, which mirrors the
            root signature while avoiding a mutable default.

    Returns:
        Whatever the root ``main`` returns.
    """
    forwarded_args = {} if direct_mode_args is None else direct_mode_args
    return _root_cli.main(direct_mode_args=forwarded_args)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
__all__ = ["build_cli_argument_parser", "get_cli_default_args_dict", "main"]
|
doc_redaction/data_anonymise.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Public API wrappers for tabular anonymisation functions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from tools.data_anonymise import anonymise_files_with_open_text
|
| 8 |
+
|
| 9 |
+
__all__ = ["anonymise_files_with_open_text"]
|
doc_redaction/example_data/Bold minimalist professional cover letter.docx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
|
| 3 |
+
size 23992
|
doc_redaction/example_data/Difficult handwritten note.jpg
ADDED
|
Git LFS Details
|
doc_redaction/example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:caf00ca5cb06b8019804d1a7eaeceec772607969e8cad6c34d1d583876345b90
|
| 3 |
+
size 116763
|
doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
|
| 3 |
+
size 426602
|
doc_redaction/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
another country or territory sign a formel agreement on behalf? of their communities endorsing a
|
| 2 |
+
soster citues international
|
doc_redaction/example_data/combined_case_notes.csv
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Date,Social Worker,Client,Case Note
|
| 2 |
+
"January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
|
| 3 |
+
"January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
|
| 4 |
+
"February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
|
| 5 |
+
"February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
|
| 6 |
+
"March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
|
| 7 |
+
"March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
|
| 8 |
+
"April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
|
| 9 |
+
"April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
|
| 10 |
+
"May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
|
| 11 |
+
"January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
|
| 12 |
+
"January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
|
| 13 |
+
"February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
|
| 14 |
+
"February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
|
| 15 |
+
"March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
|
| 16 |
+
"March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
|
| 17 |
+
"April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
|
| 18 |
+
"April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
|
| 19 |
+
"May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
|
doc_redaction/example_data/combined_case_notes.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09300597024591d0b5b4ef97faef12fcceb28fcbb6ea09260bc42f43967753a4
|
| 3 |
+
size 12579
|
doc_redaction/example_data/doubled_output_joined.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
|
| 3 |
+
size 1274719
|
doc_redaction/example_data/example_complaint_letter.jpg
ADDED
|
Git LFS Details
|
doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
|
| 3 |
+
size 8848
|
doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
page,text,left,top,width,height,line
|
| 2 |
+
1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
|
| 3 |
+
1,SisterCities,0.169804,0.033333,0.238431,0.028182,2
|
| 4 |
+
1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
|
| 5 |
+
1,Toolkit,0.830588,0.07303,0.126667,0.025152,4
|
| 6 |
+
1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,5
|
| 7 |
+
1,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
|
| 8 |
+
1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
|
| 9 |
+
1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
|
| 10 |
+
1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
|
| 11 |
+
1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
|
| 12 |
+
1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
|
| 13 |
+
1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
|
| 14 |
+
1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
|
| 15 |
+
1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
|
| 16 |
+
1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
|
| 17 |
+
1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
|
| 18 |
+
1,Friendship City,0.118039,0.372121,0.127059,0.013939,17
|
| 19 |
+
1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
|
| 20 |
+
1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
|
| 21 |
+
1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
|
| 22 |
+
1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
|
| 23 |
+
1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
|
| 24 |
+
1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
|
| 25 |
+
1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
|
| 26 |
+
1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
|
| 27 |
+
1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
|
| 28 |
+
1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
|
| 29 |
+
1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
|
| 30 |
+
1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
|
| 31 |
+
1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
|
| 32 |
+
1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
|
| 33 |
+
1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
|
| 34 |
+
1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
|
| 35 |
+
1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
|
| 36 |
+
1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
|
| 37 |
+
1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
|
| 38 |
+
1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
|
| 39 |
+
1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
|
| 40 |
+
1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
|
| 41 |
+
1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
|
| 42 |
+
1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
|
| 43 |
+
1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
|
| 44 |
+
1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
|
| 45 |
+
1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
|
| 46 |
+
1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
|
| 47 |
+
2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
|
| 48 |
+
2,SisterCities,0.169804,0.033333,0.238824,0.028182,2
|
| 49 |
+
2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
|
| 50 |
+
2,Toolkit,0.83098,0.072727,0.127059,0.025455,4
|
| 51 |
+
2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,5
|
| 52 |
+
2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
|
| 53 |
+
2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
|
| 54 |
+
2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
|
| 55 |
+
2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
|
| 56 |
+
2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
|
| 57 |
+
2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
|
| 58 |
+
2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
|
| 59 |
+
2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
|
| 60 |
+
2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
|
| 61 |
+
2,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
|
| 62 |
+
2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
|
| 63 |
+
2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
|
| 64 |
+
2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
|
| 65 |
+
2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
|
| 66 |
+
2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
|
| 67 |
+
2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
|
| 68 |
+
2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
|
| 69 |
+
2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
|
| 70 |
+
2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
|
| 71 |
+
2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
|
| 72 |
+
2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
|
| 73 |
+
2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
|
| 74 |
+
2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
|
| 75 |
+
2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
|
| 76 |
+
2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
|
| 77 |
+
2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
|
| 78 |
+
2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
|
| 79 |
+
2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
|
| 80 |
+
2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
|
| 81 |
+
2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
|
| 82 |
+
2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
|
| 83 |
+
2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
|
| 84 |
+
2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
|
| 85 |
+
2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
|
| 86 |
+
2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
|
| 87 |
+
2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
|
| 88 |
+
2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
|
| 89 |
+
2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
|
| 90 |
+
2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
|
| 91 |
+
2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,1
|
| 92 |
+
2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,2
|
| 93 |
+
3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,3
|
| 94 |
+
3,SisterCities,0.169804,0.033333,0.239216,0.028182,4
|
| 95 |
+
3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,5
|
| 96 |
+
3,Toolkit,0.83098,0.07303,0.126667,0.025152,6
|
| 97 |
+
3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,7
|
| 98 |
+
3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,8
|
| 99 |
+
3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,9
|
| 100 |
+
3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,10
|
| 101 |
+
3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,11
|
| 102 |
+
3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,12
|
| 103 |
+
3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,13
|
| 104 |
+
3,and cooperation.,0.176471,0.25697,0.13451,0.013333,14
|
| 105 |
+
3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,15
|
| 106 |
+
3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,16
|
| 107 |
+
3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,17
|
| 108 |
+
3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,18
|
| 109 |
+
3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,19
|
| 110 |
+
3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,20
|
| 111 |
+
3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,21
|
| 112 |
+
3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,22
|
| 113 |
+
3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,23
|
| 114 |
+
3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,24
|
| 115 |
+
3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,25
|
| 116 |
+
3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,26
|
| 117 |
+
3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,27
|
| 118 |
+
3,for their records.,0.176078,0.550606,0.131373,0.010606,28
|
| 119 |
+
3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,29
|
| 120 |
+
3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,30
|
| 121 |
+
3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,31
|
| 122 |
+
3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,32
|
| 123 |
+
3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,33
|
| 124 |
+
3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,34
|
| 125 |
+
3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,35
|
| 126 |
+
3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,36
|
| 127 |
+
3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,37
|
| 128 |
+
3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,38
|
| 129 |
+
3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,39
|
| 130 |
+
3,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,40
|
| 131 |
+
3,347-8630.,0.117647,0.799394,0.080392,0.010303,41
|
| 132 |
+
4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,1
|
| 133 |
+
4,SisterCities,0.169412,0.033333,0.239608,0.028485,2
|
| 134 |
+
4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
|
| 135 |
+
4,Toolkit,0.830588,0.072727,0.127843,0.025758,4
|
| 136 |
+
4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
|
| 137 |
+
4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
|
| 138 |
+
4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
|
| 139 |
+
4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
|
| 140 |
+
4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
|
| 141 |
+
4,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
|
| 142 |
+
4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
|
| 143 |
+
4,AND,0.487843,0.452727,0.048235,0.011212,12
|
| 144 |
+
4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
|
| 145 |
+
4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
|
| 146 |
+
4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
|
| 147 |
+
4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
|
| 148 |
+
4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
|
| 149 |
+
4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
|
| 150 |
+
4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
|
| 151 |
+
4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
|
| 152 |
+
4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
|
| 153 |
+
4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
|
| 154 |
+
4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
|
| 155 |
+
4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
|
| 156 |
+
4,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
|
| 157 |
+
4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
|
| 158 |
+
4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
|
| 159 |
+
4,A,0.344314,0.768485,0.084706,0.030303,28
|
| 160 |
+
4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
|
| 161 |
+
4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
|
| 162 |
+
4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,31
|
| 163 |
+
4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,32
|
| 164 |
+
4,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
|
| 165 |
+
5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
|
| 166 |
+
5,SisterCities,0.169412,0.033333,0.239608,0.028485,2
|
| 167 |
+
5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
|
| 168 |
+
5,Toolkit,0.83098,0.072727,0.127059,0.025758,4
|
| 169 |
+
5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
|
| 170 |
+
5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
|
| 171 |
+
5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
|
| 172 |
+
5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
|
| 173 |
+
5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
|
| 174 |
+
5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
|
| 175 |
+
5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
|
| 176 |
+
5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
|
| 177 |
+
5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
|
| 178 |
+
5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
|
| 179 |
+
5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
|
| 180 |
+
5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
|
| 181 |
+
5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
|
| 182 |
+
5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
|
| 183 |
+
5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
|
| 184 |
+
5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
|
| 185 |
+
5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
|
| 186 |
+
5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
|
| 187 |
+
5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
|
| 188 |
+
5,the cities;,0.22902,0.624545,0.076471,0.012424,24
|
| 189 |
+
5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
|
| 190 |
+
5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
|
| 191 |
+
5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
|
| 192 |
+
5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
|
| 193 |
+
5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
|
| 194 |
+
5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
|
| 195 |
+
5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
|
| 196 |
+
5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,32
|
| 197 |
+
5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,33
|
| 198 |
+
5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,34
|
| 199 |
+
5,Mayor,0.311373,0.894848,0.053333,0.012727,35
|
| 200 |
+
5,New York City,0.287843,0.909091,0.121176,0.013333,36
|
| 201 |
+
5,London,0.701961,0.909091,0.061569,0.010606,37
|
| 202 |
+
6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,1
|
| 203 |
+
6,SisterCities,0.169412,0.03303,0.24,0.028182,2
|
| 204 |
+
6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
|
| 205 |
+
6,Toolkit,0.83098,0.072727,0.127451,0.025758,4
|
| 206 |
+
6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
|
| 207 |
+
6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
|
| 208 |
+
6,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
|
| 209 |
+
6,California,0.551373,0.257273,0.136471,0.033333,8
|
| 210 |
+
6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
|
| 211 |
+
6,between the,0.464706,0.352727,0.084314,0.009697,10
|
| 212 |
+
6,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
|
| 213 |
+
6,"California, USA",0.4,0.397576,0.21098,0.016061,12
|
| 214 |
+
6,and the,0.48,0.415152,0.053333,0.009091,13
|
| 215 |
+
6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
|
| 216 |
+
6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
|
| 217 |
+
6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
|
| 218 |
+
6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
|
| 219 |
+
6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
|
| 220 |
+
6,purposes:,0.216863,0.516061,0.058039,0.009394,19
|
| 221 |
+
6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
|
| 222 |
+
6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
|
| 223 |
+
6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
|
| 224 |
+
6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
|
| 225 |
+
6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
|
| 226 |
+
6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
|
| 227 |
+
6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
|
| 228 |
+
6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
|
| 229 |
+
6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
|
| 230 |
+
6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
|
| 231 |
+
6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
|
| 232 |
+
6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
|
| 233 |
+
6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
|
| 234 |
+
6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
|
| 235 |
+
6,"California, USA",0.582745,0.765758,0.125098,0.01303,34
|
| 236 |
+
6,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
|
| 237 |
+
6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
|
| 238 |
+
6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
|
| 239 |
+
6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
|
| 240 |
+
6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
|
| 241 |
+
7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
|
| 242 |
+
7,SisterCities,0.169412,0.03303,0.24,0.028485,2
|
| 243 |
+
7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
|
| 244 |
+
7,Toolkit,0.83098,0.072727,0.127451,0.025758,4
|
| 245 |
+
7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
|
| 246 |
+
7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
|
| 247 |
+
7,adopted by,0.2,0.213333,0.080392,0.013636,7
|
| 248 |
+
7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
|
| 249 |
+
7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
|
| 250 |
+
7,and,0.199608,0.260909,0.026275,0.010606,10
|
| 251 |
+
7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
|
| 252 |
+
7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
|
| 253 |
+
7,ON,0.551765,0.298182,0.026667,0.011515,13
|
| 254 |
+
7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
|
| 255 |
+
7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
|
| 256 |
+
7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
|
| 257 |
+
7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
|
| 258 |
+
7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
|
| 259 |
+
7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
|
| 260 |
+
7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
|
| 261 |
+
7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
|
| 262 |
+
7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
|
| 263 |
+
7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
|
| 264 |
+
7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
|
| 265 |
+
7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
|
| 266 |
+
7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
|
| 267 |
+
7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
|
| 268 |
+
7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
|
| 269 |
+
7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
|
| 270 |
+
7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
|
| 271 |
+
7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
|
| 272 |
+
7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
|
| 273 |
+
7,3h.5.,0.593725,0.750606,0.218039,0.06303,33
|
| 274 |
+
7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,34
|
| 275 |
+
7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,35
|
| 276 |
+
7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,36
|
| 277 |
+
7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,37
|
doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
image,page,label,color,xmin,ymin,xmax,ymax,id,text
|
| 2 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
|
| 3 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
|
| 4 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
|
| 5 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
|
| 6 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
|
| 7 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,akaplan@sister-cities.org
|
| 8 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
|
| 9 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
|
| 10 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
|
| 11 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
|
| 12 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
|
| 13 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
|
| 14 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
|
| 15 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
|
| 16 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
|
| 17 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
|
| 18 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
|
| 19 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
|
| 20 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
|
| 21 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
|
| 22 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
|
| 23 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
|
| 24 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
|
| 25 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
|
| 26 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
|
| 27 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
|
| 28 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
|
| 29 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
|
| 30 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
|
| 31 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
|
| 32 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
|
| 33 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
|
| 34 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
|
| 35 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
|
| 36 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
|
| 37 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
|
| 38 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
|
| 39 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
|
| 40 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
|
| 41 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
|
| 42 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
|
| 43 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
|
| 44 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
|
| 45 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
|
| 46 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
|
| 47 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
|
| 48 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
|
| 49 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
|
| 50 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
|
| 51 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
|
| 52 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
|
| 53 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
|
| 54 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
|
| 55 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
|
| 56 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
|
| 57 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
|
| 58 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
|
| 59 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
|
| 60 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
|
| 61 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
|
| 62 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
|
| 63 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
|
| 64 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
|
| 65 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
|
| 66 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
|
| 67 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
|
| 68 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
|
| 69 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
|
| 70 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
|
| 71 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
|
| 72 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
|
| 73 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
|
| 74 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
|
| 75 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
|
| 76 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
|
| 77 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
|
doc_redaction/example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_output_local_ocr.csv
ADDED
|
@@ -0,0 +1,1241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
page,text,left,top,width,height,line,conf
|
| 2 |
+
1,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,99.0
|
| 3 |
+
1,IN,0.17098,0.066667,0.040392,0.009091,2,92.0
|
| 4 |
+
1,Connect,0.167843,0.085152,0.068627,0.015455,3,98.0
|
| 5 |
+
1,Types of,0.115686,0.153939,0.114118,0.024242,4,98.0
|
| 6 |
+
1,Sister City,0.116471,0.185758,0.095686,0.015455,5,97.0
|
| 7 |
+
1,A Sister City,0.116078,0.209394,0.101569,0.015152,6,98.0
|
| 8 |
+
1,"do not take place,",0.116863,0.22697,0.147451,0.013333,7,99.0
|
| 9 |
+
1,another country,0.116863,0.242424,0.132549,0.013939,8,98.0
|
| 10 |
+
1,"""sister city/sister",0.116863,0.258485,0.156078,0.013636,9,99.0
|
| 11 |
+
1,unless otherwise,0.116863,0.274848,0.141961,0.013939,10,99.0
|
| 12 |
+
1,Sister Cities,0.116863,0.298485,0.099608,0.014545,11,99.0
|
| 13 |
+
1,good standing,0.116471,0.314545,0.116078,0.014545,12,99.0
|
| 14 |
+
1,"website. However,",0.115686,0.330606,0.149412,0.014848,13,99.0
|
| 15 |
+
1,legitimacy of,0.116863,0.347576,0.106667,0.013636,14,98.0
|
| 16 |
+
1,Friendship City,0.116078,0.370303,0.129412,0.016061,15,98.0
|
| 17 |
+
1,A Friendship City,0.115686,0.393939,0.139608,0.015758,16,97.0
|
| 18 |
+
1,"stone"" to a more",0.116078,0.410909,0.138039,0.014545,17,99.0
|
| 19 |
+
1,referred to,0.116863,0.427273,0.108235,0.013333,18,99.0
|
| 20 |
+
1,recognize,0.116863,0.443333,0.086275,0.014242,19,98.0
|
| 21 |
+
1,As per Sister,0.117255,0.467576,0.106667,0.013333,20,99.0
|
| 22 |
+
1,Sister,0.167451,0.490606,0.052941,0.014242,21,97.0
|
| 23 |
+
1,U.S. and,0.167843,0.50697,0.070588,0.014242,22,98.0
|
| 24 |
+
1,between,0.167451,0.523333,0.071765,0.013939,23,98.0
|
| 25 |
+
1,cooperative,0.167451,0.539091,0.102745,0.015455,24,99.0
|
| 26 |
+
1,International.,0.168235,0.555758,0.120392,0.013939,25,98.0
|
| 27 |
+
1,is received,0.167451,0.571515,0.094118,0.013636,26,99.0
|
| 28 |
+
1,recognize,0.168235,0.587879,0.085098,0.014545,27,99.0
|
| 29 |
+
1,with a,0.168627,0.604848,0.05098,0.013333,28,99.0
|
| 30 |
+
1,The,0.168235,0.627879,0.034902,0.014545,29,99.0
|
| 31 |
+
1,Mayor/County,0.169804,0.660909,0.116863,0.013939,30,97.0
|
| 32 |
+
1,communities.,0.167843,0.67697,0.107843,0.013939,31,99.0
|
| 33 |
+
1,"from, or",0.168627,0.693333,0.072941,0.013333,32,98.0
|
| 34 |
+
1,of the,0.168627,0.709697,0.053333,0.012424,33,99.0
|
| 35 |
+
1,Although,0.168235,0.725152,0.073333,0.013939,34,99.0
|
| 36 |
+
1,"process,",0.168235,0.741818,0.084706,0.013939,35,98.0
|
| 37 |
+
1,Cities,0.167843,0.75697,0.048235,0.015152,36,98.0
|
| 38 |
+
1,the,0.168627,0.774545,0.029804,0.013333,37,99.0
|
| 39 |
+
1,To place a,0.116078,0.796667,0.087059,0.015152,38,98.0
|
| 40 |
+
1,community must,0.116863,0.813939,0.134902,0.014242,39,98.0
|
| 41 |
+
1,remain sister,0.116863,0.83,0.113333,0.013333,40,98.0
|
| 42 |
+
1,both cities are,0.116471,0.845758,0.12,0.013636,41,98.0
|
| 43 |
+
1,informed in,0.116863,0.861818,0.096863,0.014848,42,98.0
|
| 44 |
+
1,TERN ATION,0.211373,0.066667,0.164314,0.009091,43,92.0
|
| 45 |
+
1,globally. Thrive,0.236471,0.085152,0.121961,0.015455,44,98.0
|
| 46 |
+
1,Affiliations,0.229804,0.153939,0.131373,0.024242,45,98.0
|
| 47 |
+
1,Relationship,0.212157,0.185758,0.103137,0.015455,46,97.0
|
| 48 |
+
1,relationship is,0.217647,0.209394,0.115294,0.015152,47,98.0
|
| 49 |
+
1,highest,0.264314,0.22697,0.064314,0.013333,48,99.0
|
| 50 |
+
1,or territory,0.249412,0.242424,0.121176,0.013939,49,98.0
|
| 51 |
+
1,"cities""",0.272941,0.258485,0.048627,0.013636,50,99.0
|
| 52 |
+
1,indicated,0.258824,0.274848,0.076863,0.013939,51,99.0
|
| 53 |
+
1,International shall,0.216471,0.298485,0.150196,0.014545,52,99.0
|
| 54 |
+
1,(i.e. who are,0.232549,0.314545,0.103137,0.014545,53,99.0
|
| 55 |
+
1,Sister Cities,0.265098,0.330606,0.101569,0.014848,54,99.0
|
| 56 |
+
1,those relationships,0.223529,0.347576,0.157647,0.013636,55,98.0
|
| 57 |
+
1,or Friendship,0.255294,0.393939,0.113725,0.015758,56,97.0
|
| 58 |
+
1,"formal ""Sister",0.254118,0.410909,0.121961,0.014545,57,99.0
|
| 59 |
+
1,as such in the,0.225098,0.427273,0.103922,0.013333,58,99.0
|
| 60 |
+
1,Friendship City,0.203137,0.443333,0.125882,0.014242,59,98.0
|
| 61 |
+
1,Cities International,0.223922,0.467576,0.151373,0.013333,60,99.0
|
| 62 |
+
1,Cities International,0.220392,0.490606,0.156863,0.014242,61,97.0
|
| 63 |
+
1,an international,0.238431,0.50697,0.128235,0.014242,62,98.0
|
| 64 |
+
1,that international,0.239216,0.523333,0.144314,0.013939,63,98.0
|
| 65 |
+
1,agreement,0.270196,0.539091,0.092549,0.015455,64,99.0
|
| 66 |
+
1,If a,0.288627,0.555758,0.029804,0.013939,65,98.0
|
| 67 |
+
1,within a,0.261569,0.571515,0.063529,0.013636,66,99.0
|
| 68 |
+
1,the partnership,0.253333,0.587879,0.130196,0.014545,67,99.0
|
| 69 |
+
1,symbol in the,0.219608,0.604848,0.110196,0.013333,68,99.0
|
| 70 |
+
1,cooperative,0.203137,0.627879,0.102745,0.014545,69,99.0
|
| 71 |
+
1,Executive/Governor,0.169804,0.644848,0.179216,0.013333,70,99.0
|
| 72 |
+
1,Although,0.275686,0.67697,0.077647,0.013939,71,99.0
|
| 73 |
+
1,may be given,0.241569,0.693333,0.108235,0.013333,72,98.0
|
| 74 |
+
1,Mayor/County,0.221961,0.709697,0.12,0.012424,73,99.0
|
| 75 |
+
1,Sister Cities,0.241569,0.725152,0.102745,0.013939,74,99.0
|
| 76 |
+
1,it is up to the,0.252941,0.741818,0.100392,0.013939,75,98.0
|
| 77 |
+
1,International will,0.216078,0.75697,0.139608,0.015152,76,98.0
|
| 78 |
+
1,cooperative agreement.,0.198431,0.774545,0.189804,0.013333,77,99.0
|
| 79 |
+
1,relationship into,0.203137,0.796667,0.133725,0.015152,78,98.0
|
| 80 |
+
1,write a letter,0.251765,0.813939,0.104706,0.014242,79,98.0
|
| 81 |
+
1,"cities, but",0.230196,0.83,0.080392,0.013333,80,98.0
|
| 82 |
+
1,able to sustain,0.236471,0.845758,0.12,0.013636,81,98.0
|
| 83 |
+
1,writing by the,0.213725,0.861818,0.112157,0.014848,82,98.0
|
| 84 |
+
1,AL,0.375686,0.066667,0.032941,0.009091,83,92.0
|
| 85 |
+
1,locally.,0.358431,0.085152,0.051765,0.015455,84,98.0
|
| 86 |
+
1,formed when,0.332941,0.209394,0.110196,0.015152,85,98.0
|
| 87 |
+
1,appointed,0.328627,0.22697,0.083922,0.013333,86,99.0
|
| 88 |
+
1,sign a formal,0.370588,0.242424,0.101569,0.013939,87,98.0
|
| 89 |
+
1,relationship. Sister,0.321569,0.258485,0.17098,0.013636,88,99.0
|
| 90 |
+
1,by one or both,0.335686,0.274848,0.122353,0.013939,89,99.0
|
| 91 |
+
1,formally,0.366667,0.298485,0.071765,0.014545,90,99.0
|
| 92 |
+
1,current on,0.335686,0.314545,0.09098,0.014545,91,99.0
|
| 93 |
+
1,International,0.366667,0.330606,0.108627,0.014848,92,99.0
|
| 94 |
+
1,formed,0.381176,0.347576,0.065098,0.013636,93,98.0
|
| 95 |
+
1,Cities,0.36902,0.393939,0.047843,0.015758,94,97.0
|
| 96 |
+
1,"City""",0.376078,0.410909,0.043922,0.014545,95,99.0
|
| 97 |
+
1,formal documents,0.32902,0.427273,0.146667,0.013333,96,99.0
|
| 98 |
+
1,relationships by,0.32902,0.443333,0.133725,0.014242,97,98.0
|
| 99 |
+
1,Board of,0.375294,0.467576,0.07451,0.013333,98,99.0
|
| 100 |
+
1,will recognize,0.377255,0.490606,0.111765,0.014242,99,97.0
|
| 101 |
+
1,"community,",0.366667,0.50697,0.098824,0.014242,100,98.0
|
| 102 |
+
1,community,0.383529,0.523333,0.089804,0.013939,101,98.0
|
| 103 |
+
1,among all,0.362745,0.539091,0.079216,0.015455,102,99.0
|
| 104 |
+
1,cooperative,0.318431,0.555758,0.114902,0.013939,103,98.0
|
| 105 |
+
1,reasonable amount,0.325098,0.571515,0.157255,0.013636,104,99.0
|
| 106 |
+
1,as a,0.383529,0.587879,0.038039,0.014545,105,99.0
|
| 107 |
+
1,membership,0.329804,0.604848,0.105098,0.013333,106,99.0
|
| 108 |
+
1,agreement must be,0.305882,0.627879,0.158824,0.014545,107,99.0
|
| 109 |
+
1,of the requesting,0.34902,0.644848,0.148627,0.013333,108,99.0
|
| 110 |
+
1,Executive/Governor,0.286667,0.660909,0.164314,0.013939,109,97.0
|
| 111 |
+
1,the Mayor/County,0.353333,0.67697,0.153333,0.013939,110,99.0
|
| 112 |
+
1,"input by, the",0.349804,0.693333,0.100392,0.013333,111,98.0
|
| 113 |
+
1,Executive/Governorto,0.341961,0.709697,0.185098,0.012424,112,99.0
|
| 114 |
+
1,International,0.344314,0.725152,0.131765,0.013939,113,99.0
|
| 115 |
+
1,requesting,0.353333,0.741818,0.093333,0.013939,114,98.0
|
| 116 |
+
1,"not, in any",0.355686,0.75697,0.085098,0.015152,115,98.0
|
| 117 |
+
1,"Emeritus status,",0.336863,0.796667,0.135294,0.015152,116,98.0
|
| 118 |
+
1,to the mayor,0.356471,0.813939,0.106275,0.014242,117,98.0
|
| 119 |
+
1,understand that,0.310588,0.83,0.133333,0.013333,118,98.0
|
| 120 |
+
1,an active,0.356471,0.845758,0.076863,0.013636,119,98.0
|
| 121 |
+
1,mayor of the U.S.,0.325882,0.861818,0.141176,0.014848,120,98.0
|
| 122 |
+
1,the mayor or,0.443137,0.209394,0.107843,0.015152,121,98.0
|
| 123 |
+
1,official) from a,0.412549,0.22697,0.138824,0.013333,122,99.0
|
| 124 |
+
1,agreement,0.472157,0.242424,0.084706,0.013939,123,98.0
|
| 125 |
+
1,city agreements,0.492549,0.258485,0.110588,0.013636,124,99.0
|
| 126 |
+
1,of the respective,0.458039,0.274848,0.162745,0.013939,125,99.0
|
| 127 |
+
1,recognize only,0.438431,0.298485,0.120784,0.014545,126,99.0
|
| 128 |
+
1,membership dues),0.426667,0.314545,0.154118,0.014545,127,99.0
|
| 129 |
+
1,shall not,0.475294,0.330606,0.071373,0.014848,128,99.0
|
| 130 |
+
1,by non-members.,0.446275,0.347576,0.138431,0.013636,129,98.0
|
| 131 |
+
1,relationship is often,0.416863,0.393939,0.16549,0.015758,130,97.0
|
| 132 |
+
1,agreement. Typically,0.42,0.410909,0.167059,0.014545,131,99.0
|
| 133 |
+
1,that are,0.475686,0.427273,0.067843,0.013333,132,99.0
|
| 134 |
+
1,members in,0.462745,0.443333,0.096863,0.014242,133,98.0
|
| 135 |
+
1,Directors:,0.449804,0.467576,0.082353,0.013333,134,99.0
|
| 136 |
+
1,a new sister,0.48902,0.490606,0.101176,0.014242,135,97.0
|
| 137 |
+
1,even though,0.46549,0.50697,0.104706,0.014242,136,98.0
|
| 138 |
+
1,and a,0.473333,0.523333,0.049412,0.013939,137,98.0
|
| 139 |
+
1,involved communities,0.441961,0.539091,0.174902,0.015455,138,99.0
|
| 140 |
+
1,"agreement is denied,",0.433333,0.555758,0.147843,0.013939,139,98.0
|
| 141 |
+
1,"of time,",0.482353,0.571515,0.063529,0.013636,140,99.0
|
| 142 |
+
1,friendship city,0.421569,0.587879,0.142745,0.014545,141,99.0
|
| 143 |
+
1,directories.,0.434902,0.604848,0.093725,0.013333,142,99.0
|
| 144 |
+
1,sent by the,0.464706,0.627879,0.094902,0.014545,143,99.0
|
| 145 |
+
1,"community,",0.497647,0.644848,0.081176,0.013333,144,99.0
|
| 146 |
+
1,of each of the,0.45098,0.660909,0.117647,0.013939,145,97.0
|
| 147 |
+
1,sister cities,0.450196,0.693333,0.1,0.013333,146,98.0
|
| 148 |
+
1,sign,0.527059,0.709697,0.037647,0.012424,147,99.0
|
| 149 |
+
1,will help with,0.476078,0.725152,0.081961,0.013939,148,99.0
|
| 150 |
+
1,community to,0.446667,0.741818,0.109804,0.013939,149,98.0
|
| 151 |
+
1,"way, force a",0.440784,0.75697,0.106275,0.015152,150,98.0
|
| 152 |
+
1,the mayor,0.472157,0.796667,0.086667,0.015152,151,98.0
|
| 153 |
+
1,of the foreign,0.462745,0.813939,0.120784,0.014242,152,98.0
|
| 154 |
+
1,the relationship,0.443922,0.83,0.129804,0.013333,153,98.0
|
| 155 |
+
1,relationship. Sister,0.433333,0.845758,0.155294,0.013636,154,98.0
|
| 156 |
+
1,city of the,0.467059,0.861818,0.087451,0.014848,155,98.0
|
| 157 |
+
1,highest elected,0.55098,0.209394,0.126667,0.015152,156,98.0
|
| 158 |
+
1,U.S. community,0.551373,0.22697,0.122745,0.013333,157,99.0
|
| 159 |
+
1,on behalf of,0.556863,0.242424,0.116078,0.013939,158,98.0
|
| 160 |
+
1,shall be,0.603137,0.258485,0.063529,0.013636,159,99.0
|
| 161 |
+
1,communities.,0.620784,0.274848,0.084314,0.013939,160,99.0
|
| 162 |
+
1,those relationships,0.559216,0.298485,0.15451,0.014545,161,99.0
|
| 163 |
+
1,in its,0.580784,0.314545,0.042353,0.014545,162,99.0
|
| 164 |
+
1,assert as invalid,0.546667,0.330606,0.138039,0.014848,163,99.0
|
| 165 |
+
1,formed by,0.582353,0.393939,0.088627,0.015758,164,97.0
|
| 166 |
+
1,Friendship,0.587059,0.410909,0.089412,0.014545,165,99.0
|
| 167 |
+
1,signed. Sister,0.543529,0.427273,0.11098,0.013333,166,99.0
|
| 168 |
+
1,its Membership,0.559608,0.443333,0.126275,0.014242,167,98.0
|
| 169 |
+
1,cities,0.590196,0.490606,0.046275,0.014242,168,97.0
|
| 170 |
+
1,another,0.570196,0.50697,0.067451,0.014242,169,98.0
|
| 171 |
+
1,different U.S.,0.522745,0.523333,0.116471,0.013939,170,98.0
|
| 172 |
+
1,is filed,0.616863,0.539091,0.060784,0.015455,171,99.0
|
| 173 |
+
1,or no response,0.581176,0.555758,0.127059,0.013939,172,98.0
|
| 174 |
+
1,Sister Cities,0.545882,0.571515,0.102353,0.013636,173,99.0
|
| 175 |
+
1,and it will be,0.564314,0.587879,0.079608,0.014545,174,99.0
|
| 176 |
+
1,Mayor/County,0.559608,0.627879,0.116863,0.014545,175,99.0
|
| 177 |
+
1,and must be,0.578824,0.644848,0.100784,0.013333,176,99.0
|
| 178 |
+
1,existing,0.568627,0.660909,0.064314,0.013939,177,97.0
|
| 179 |
+
1,Executive/Governor,0.506667,0.67697,0.165882,0.013939,178,99.0
|
| 180 |
+
1,"program, it is",0.550196,0.693333,0.11098,0.013333,179,98.0
|
| 181 |
+
1,the cooperative,0.564706,0.709697,0.130196,0.012424,180,99.0
|
| 182 |
+
1,the cooperative,0.558039,0.725152,0.131765,0.013939,181,99.0
|
| 183 |
+
1,get the agreement,0.556471,0.741818,0.152941,0.013939,182,98.0
|
| 184 |
+
1,community to,0.547059,0.75697,0.108627,0.015152,183,98.0
|
| 185 |
+
1,or highest,0.558824,0.796667,0.086275,0.015152,184,98.0
|
| 186 |
+
1,city indicating,0.583529,0.813939,0.114118,0.014242,185,98.0
|
| 187 |
+
1,will remain,0.573725,0.83,0.088235,0.013333,186,98.0
|
| 188 |
+
1,Cities,0.588627,0.845758,0.048235,0.013636,187,98.0
|
| 189 |
+
1,situation. Sister,0.55451,0.861818,0.128235,0.014848,188,98.0
|
| 190 |
+
1,"official (or,",0.677647,0.209394,0.103137,0.015152,189,98.0
|
| 191 |
+
1,and a community,0.674118,0.22697,0.139608,0.013333,190,99.0
|
| 192 |
+
1,their communities,0.672941,0.242424,0.114902,0.013939,191,98.0
|
| 193 |
+
1,considered,0.666667,0.258485,0.092549,0.013636,192,99.0
|
| 194 |
+
1,by,0.713725,0.298485,0.023529,0.014545,193,99.0
|
| 195 |
+
1,Membership Directory,0.623137,0.314545,0.183529,0.014545,194,99.0
|
| 196 |
+
1,or otherwise,0.684706,0.330606,0.135294,0.014848,195,99.0
|
| 197 |
+
1,cities as a,0.67098,0.393939,0.085098,0.015758,196,97.0
|
| 198 |
+
1,City agreements,0.676471,0.410909,0.133725,0.014545,197,99.0
|
| 199 |
+
1,Cities International,0.65451,0.427273,0.155686,0.013333,198,99.0
|
| 200 |
+
1,Directory,0.685882,0.443333,0.083137,0.014242,199,98.0
|
| 201 |
+
1,affiliation between,0.636471,0.490606,0.151373,0.014242,200,97.0
|
| 202 |
+
1,affiliation may exist,0.637647,0.50697,0.158824,0.014242,201,98.0
|
| 203 |
+
1,"community, only",0.639216,0.523333,0.131373,0.013939,202,98.0
|
| 204 |
+
1,with Sister,0.677647,0.539091,0.091373,0.015455,203,99.0
|
| 205 |
+
1,to the,0.708235,0.555758,0.048627,0.013939,204,98.0
|
| 206 |
+
1,International will,0.648235,0.571515,0.13451,0.013636,205,99.0
|
| 207 |
+
1,delineated as such,0.643922,0.587879,0.147059,0.014545,206,99.0
|
| 208 |
+
1,sent to the,0.679608,0.644848,0.085098,0.013333,207,99.0
|
| 209 |
+
1,partnership,0.632941,0.660909,0.092941,0.013939,208,97.0
|
| 210 |
+
1,may request,0.672549,0.67697,0.106275,0.013939,209,99.0
|
| 211 |
+
1,up to the discretion,0.661176,0.693333,0.154902,0.013333,210,98.0
|
| 212 |
+
1,agreement.,0.694902,0.709697,0.088235,0.012424,211,99.0
|
| 213 |
+
1,agreement,0.689804,0.725152,0.085882,0.013939,212,99.0
|
| 214 |
+
1,signed.,0.709412,0.741818,0.060784,0.013939,213,98.0
|
| 215 |
+
1,"""share"" and sign",0.655686,0.75697,0.138039,0.015152,214,98.0
|
| 216 |
+
1,elected official,0.645098,0.796667,0.128235,0.015152,215,98.0
|
| 217 |
+
1,that they,0.697647,0.813939,0.075294,0.014242,216,98.0
|
| 218 |
+
1,inactive until,0.661961,0.83,0.104706,0.013333,217,98.0
|
| 219 |
+
1,International should,0.636863,0.845758,0.168235,0.013636,218,98.0
|
| 220 |
+
1,Cities,0.682745,0.861818,0.048235,0.014848,219,98.0
|
| 221 |
+
1,if elections,0.780784,0.209394,0.076471,0.015152,220,98.0
|
| 222 |
+
1,in,0.813725,0.22697,0.013725,0.013333,221,99.0
|
| 223 |
+
1,endorsing,0.787843,0.242424,0.083529,0.013939,222,98.0
|
| 224 |
+
1,active/valid,0.759216,0.258485,0.091373,0.013636,223,99.0
|
| 225 |
+
1,cities/members,0.737255,0.298485,0.127451,0.014545,224,99.0
|
| 226 |
+
1,or on its,0.806667,0.314545,0.067059,0.014545,225,99.0
|
| 227 |
+
1,impugn the,0.82,0.330606,0.057647,0.014848,226,99.0
|
| 228 |
+
1,"""stepping",0.756078,0.393939,0.076471,0.015758,227,97.0
|
| 229 |
+
1,are,0.810196,0.410909,0.028235,0.014545,228,99.0
|
| 230 |
+
1,shall,0.810196,0.427273,0.035686,0.013333,229,99.0
|
| 231 |
+
1,and website.,0.76902,0.443333,0.096863,0.014242,230,98.0
|
| 232 |
+
1,a,0.787843,0.490606,0.009412,0.014242,231,97.0
|
| 233 |
+
1,if a,0.770588,0.523333,0.028627,0.013939,232,98.0
|
| 234 |
+
1,Cities,0.76902,0.539091,0.043922,0.015455,233,99.0
|
| 235 |
+
1,request,0.756863,0.555758,0.061961,0.013939,234,98.0
|
| 236 |
+
1,input,0.778824,0.67697,0.038824,0.013939,235,99.0
|
| 237 |
+
1,Sister,0.770196,0.741818,0.049804,0.013939,236,98.0
|
| 238 |
+
1,of the U.S.,0.773333,0.796667,0.081176,0.015152,237,98.0
|
| 239 |
+
1,wish to,0.772941,0.813939,0.054902,0.014242,238,98.0
|
| 240 |
+
1,such time as,0.766667,0.83,0.099608,0.013333,239,98.0
|
| 241 |
+
1,be,0.805098,0.845758,0.018824,0.013636,240,98.0
|
| 242 |
+
1,International will,0.73098,0.861818,0.134118,0.014848,241,98.0
|
| 243 |
+
1,Toolkit,0.829412,0.069394,0.132549,0.031515,242,99.0
|
| 244 |
+
1,a,0.871373,0.242424,0.00902,0.013939,243,98.0
|
| 245 |
+
1,in,0.864706,0.298485,0.012941,0.014545,244,99.0
|
| 246 |
+
2,SisterCities Partnership Agreement,0.167843,0.025455,0.793725,0.03697,1,99.0
|
| 247 |
+
2,IN TE,0.17098,0.066667,0.072157,0.009091,2,87.0
|
| 248 |
+
2,Connect,0.168627,0.085758,0.067451,0.014242,3,99.0
|
| 249 |
+
2,then place the,0.116471,0.130909,0.115294,0.014242,4,98.0
|
| 250 |
+
2,lists of sister,0.116471,0.147576,0.109804,0.014545,5,97.0
|
| 251 |
+
2,If a community,0.116863,0.171515,0.12549,0.014242,6,98.0
|
| 252 |
+
2,highest elected,0.116863,0.187273,0.124314,0.014242,7,98.0
|
| 253 |
+
2,Cities International,0.116863,0.203636,0.159216,0.014242,8,97.0
|
| 254 |
+
2,and Sister Cities,0.116863,0.220303,0.132549,0.012727,9,98.0
|
| 255 |
+
2,of sister city,0.117647,0.237273,0.119216,0.012424,10,99.0
|
| 256 |
+
2,dormant. Many,0.116471,0.252424,0.122745,0.013939,11,98.0
|
| 257 |
+
2,partnership may,0.116471,0.267879,0.135294,0.015758,12,99.0
|
| 258 |
+
2,General,0.117647,0.293636,0.106275,0.018182,13,99.0
|
| 259 |
+
2,In order for a,0.117255,0.32303,0.119216,0.014848,14,98.0
|
| 260 |
+
2,"(SCl), the two",0.118039,0.339394,0.115686,0.014242,15,98.0
|
| 261 |
+
2,presumes,0.116471,0.355758,0.085882,0.014848,16,99.0
|
| 262 |
+
2,followed proper,0.116078,0.371515,0.13451,0.015152,17,99.0
|
| 263 |
+
2,with the specific,0.117255,0.388485,0.136471,0.013636,18,98.0
|
| 264 |
+
2,and that both,0.116863,0.405152,0.108235,0.013333,19,98.0
|
| 265 |
+
2,should check,0.116078,0.420303,0.10902,0.015152,20,98.0
|
| 266 |
+
2,before pursuing,0.117255,0.437576,0.133725,0.013636,21,99.0
|
| 267 |
+
2,SCl often refers,0.116078,0.459697,0.147843,0.014848,22,99.0
|
| 268 |
+
2,"Understanding.""",0.116471,0.476364,0.137647,0.014848,23,99.0
|
| 269 |
+
2,your documents,0.116863,0.493939,0.134902,0.013333,24,99.0
|
| 270 |
+
2,A few things to,0.116471,0.516667,0.122353,0.014545,25,99.0
|
| 271 |
+
2,Your,0.176078,0.541515,0.044314,0.013636,26,98.0
|
| 272 |
+
2,commitment,0.174902,0.557273,0.101961,0.014242,27,98.0
|
| 273 |
+
2,with,0.175294,0.573939,0.034118,0.013333,28,99.0
|
| 274 |
+
2,related,0.176078,0.590303,0.063137,0.013333,29,99.0
|
| 275 |
+
2,Don't try,0.176078,0.619394,0.072941,0.013939,30,99.0
|
| 276 |
+
2,interest,0.175686,0.636061,0.068627,0.013333,31,99.0
|
| 277 |
+
2,include,0.174902,0.651818,0.061569,0.013939,32,99.0
|
| 278 |
+
2,the scope,0.175294,0.668788,0.081569,0.013333,33,98.0
|
| 279 |
+
2,"tasks,",0.174902,0.684848,0.051765,0.01303,34,99.0
|
| 280 |
+
2,administration,0.175294,0.700909,0.114902,0.013636,35,98.0
|
| 281 |
+
2,memorandum,0.175294,0.71697,0.112549,0.013636,36,99.0
|
| 282 |
+
2,agreement,0.175686,0.73303,0.092941,0.013939,37,98.0
|
| 283 |
+
2,with very,0.175294,0.749697,0.076863,0.013333,38,98.0
|
| 284 |
+
2,Work,0.175686,0.778485,0.047059,0.013636,39,99.0
|
| 285 |
+
2,share,0.175686,0.795152,0.05098,0.013333,40,99.0
|
| 286 |
+
2,what,0.175294,0.811212,0.041569,0.013939,41,99.0
|
| 287 |
+
2,Ask your,0.175686,0.840606,0.075294,0.013636,42,98.0
|
| 288 |
+
2,important,0.176078,0.85697,0.079608,0.013333,43,99.0
|
| 289 |
+
2,the,0.174902,0.87303,0.03098,0.014242,44,99.0
|
| 290 |
+
2,RN A TIO,0.243137,0.066667,0.108235,0.009091,45,87.0
|
| 291 |
+
2,globally. Thrive,0.236078,0.085758,0.121569,0.014242,46,99.0
|
| 292 |
+
2,partnership,0.231765,0.130909,0.100392,0.014242,47,98.0
|
| 293 |
+
2,city programs.,0.226275,0.147576,0.111765,0.014545,48,97.0
|
| 294 |
+
2,wishes to,0.242353,0.171515,0.078039,0.014242,49,98.0
|
| 295 |
+
2,official of,0.241176,0.187273,0.106667,0.014242,50,98.0
|
| 296 |
+
2,should,0.276078,0.203636,0.056863,0.014242,51,97.0
|
| 297 |
+
2,International,0.249412,0.220303,0.10902,0.012727,52,98.0
|
| 298 |
+
2,programs.,0.236863,0.237273,0.096471,0.012424,53,99.0
|
| 299 |
+
2,partnerships,0.239216,0.252424,0.111373,0.013939,54,98.0
|
| 300 |
+
2,be reinvigorated,0.251765,0.267879,0.136863,0.015758,55,99.0
|
| 301 |
+
2,Guidelines,0.223922,0.293636,0.125882,0.018182,56,99.0
|
| 302 |
+
2,sister,0.236471,0.32303,0.057255,0.014848,57,98.0
|
| 303 |
+
2,communities,0.233725,0.339394,0.103529,0.014242,58,98.0
|
| 304 |
+
2,several key,0.202353,0.355758,0.116863,0.014848,59,99.0
|
| 305 |
+
2,procedures,0.250588,0.371515,0.128627,0.015152,60,99.0
|
| 306 |
+
2,city);,0.253725,0.388485,0.082745,0.013636,61,98.0
|
| 307 |
+
2,have secured,0.225098,0.405152,0.115294,0.013333,62,98.0
|
| 308 |
+
2,with your local,0.225098,0.420303,0.120392,0.015152,63,98.0
|
| 309 |
+
2,a sister city,0.25098,0.437576,0.098039,0.013636,64,99.0
|
| 310 |
+
2,to these,0.263922,0.459697,0.080392,0.014848,65,99.0
|
| 311 |
+
2,"However,",0.254118,0.476364,0.080392,0.014848,66,99.0
|
| 312 |
+
2,is left up to,0.251765,0.493939,0.097255,0.013333,67,99.0
|
| 313 |
+
2,keep in mind,0.238824,0.516667,0.106275,0.014545,68,99.0
|
| 314 |
+
2,agreement can,0.220392,0.541515,0.124314,0.013636,69,98.0
|
| 315 |
+
2,to fostering,0.276863,0.557273,0.099608,0.014242,70,98.0
|
| 316 |
+
2,particular areas,0.209412,0.573939,0.134118,0.013333,71,99.0
|
| 317 |
+
2,to anything,0.239216,0.590303,0.093333,0.013333,72,99.0
|
| 318 |
+
2,to include,0.24902,0.619394,0.080784,0.013939,73,99.0
|
| 319 |
+
2,or participating,0.244314,0.636061,0.13098,0.013333,74,99.0
|
| 320 |
+
2,all the programs,0.236471,0.651818,0.139608,0.013939,75,99.0
|
| 321 |
+
2,of projects.,0.256863,0.668788,0.098039,0.013333,76,98.0
|
| 322 |
+
2,"responsibilities,",0.226667,0.684848,0.12902,0.01303,77,99.0
|
| 323 |
+
2,of the,0.290196,0.700909,0.052549,0.013636,78,98.0
|
| 324 |
+
2,between,0.287843,0.71697,0.074118,0.013636,79,99.0
|
| 325 |
+
2,is a,0.268627,0.73303,0.033333,0.013939,80,98.0
|
| 326 |
+
2,specific,0.252157,0.749697,0.068627,0.013333,81,98.0
|
| 327 |
+
2,with your,0.222745,0.778485,0.080392,0.013636,82,99.0
|
| 328 |
+
2,drafts of,0.226667,0.795152,0.106667,0.013333,83,99.0
|
| 329 |
+
2,they'd like to see,0.216863,0.811212,0.139608,0.013939,84,99.0
|
| 330 |
+
2,counterparts,0.25098,0.840606,0.108627,0.013636,85,98.0
|
| 331 |
+
2,for the,0.255686,0.85697,0.062745,0.013333,86,99.0
|
| 332 |
+
2,commitment their,0.205882,0.87303,0.146667,0.014242,87,99.0
|
| 333 |
+
2,N A L,0.351373,0.066667,0.057255,0.009091,88,87.0
|
| 334 |
+
2,locally.,0.357647,0.085758,0.051373,0.014242,89,99.0
|
| 335 |
+
2,into Emeritus,0.332157,0.130909,0.108627,0.014242,90,98.0
|
| 336 |
+
2,terminate a,0.320392,0.171515,0.096863,0.014242,91,98.0
|
| 337 |
+
2,the U.S. city,0.347843,0.187273,0.076078,0.014242,92,98.0
|
| 338 |
+
2,be informed,0.332941,0.203636,0.102745,0.014242,93,97.0
|
| 339 |
+
2,will then,0.358431,0.220303,0.065882,0.012727,94,98.0
|
| 340 |
+
2,We do not,0.333333,0.237273,0.06549,0.012424,95,99.0
|
| 341 |
+
2,wax and,0.350588,0.252424,0.070196,0.013939,96,98.0
|
| 342 |
+
2,by local,0.388627,0.267879,0.064314,0.015758,97,99.0
|
| 343 |
+
2,city/county/state,0.293725,0.32303,0.144314,0.014848,98,98.0
|
| 344 |
+
2,must sign,0.337255,0.339394,0.081569,0.014242,99,98.0
|
| 345 |
+
2,items: that the,0.319216,0.355758,0.109412,0.014848,100,99.0
|
| 346 |
+
2,(e.g. passed,0.379216,0.371515,0.073725,0.015152,101,99.0
|
| 347 |
+
2,that both,0.336471,0.388485,0.066275,0.013636,102,98.0
|
| 348 |
+
2,the necessary,0.340392,0.405152,0.120784,0.013333,103,98.0
|
| 349 |
+
2,sister city,0.34549,0.420303,0.084706,0.015152,104,98.0
|
| 350 |
+
2,relationship.,0.34902,0.437576,0.093725,0.013636,105,99.0
|
| 351 |
+
2,agreements as,0.344314,0.459697,0.101569,0.014848,106,99.0
|
| 352 |
+
2,as the following,0.33451,0.476364,0.133725,0.014848,107,99.0
|
| 353 |
+
2,you.,0.34902,0.493939,0.031765,0.013333,108,99.0
|
| 354 |
+
2,as you draft,0.345098,0.516667,0.101176,0.014545,109,99.0
|
| 355 |
+
2,range from,0.344706,0.541515,0.117647,0.013636,110,98.0
|
| 356 |
+
2,"of interest,",0.343529,0.573939,0.092157,0.013333,111,99.0
|
| 357 |
+
2,from numbers,0.332549,0.590303,0.132549,0.013333,112,99.0
|
| 358 |
+
2,everything,0.329804,0.619394,0.091373,0.013939,113,99.0
|
| 359 |
+
2,institutions,0.375294,0.636061,0.109804,0.013333,114,99.0
|
| 360 |
+
2,you plan,0.376078,0.651818,0.072941,0.013939,115,99.0
|
| 361 |
+
2,This is a,0.354902,0.668788,0.070588,0.013333,116,98.0
|
| 362 |
+
2,or other,0.355686,0.684848,0.072941,0.01303,117,99.0
|
| 363 |
+
2,partnership,0.342745,0.700909,0.098824,0.013636,118,98.0
|
| 364 |
+
2,the respective,0.361961,0.71697,0.118431,0.013636,119,99.0
|
| 365 |
+
2,historical,0.301961,0.73303,0.10902,0.013939,120,98.0
|
| 366 |
+
2,tasks.,0.320784,0.749697,0.045882,0.013333,121,98.0
|
| 367 |
+
2,counterparts.,0.303137,0.778485,0.136471,0.013636,122,99.0
|
| 368 |
+
2,your agreement,0.333333,0.795152,0.128235,0.013333,123,99.0
|
| 369 |
+
2,in the,0.356471,0.811212,0.049412,0.013939,124,99.0
|
| 370 |
+
2,to translate,0.359608,0.840606,0.094902,0.013636,125,98.0
|
| 371 |
+
2,citizens of,0.318431,0.85697,0.119216,0.013333,126,99.0
|
| 372 |
+
2,city has,0.352549,0.87303,0.07098,0.014242,127,99.0
|
| 373 |
+
2,Status and,0.440784,0.130909,0.08902,0.014242,128,98.0
|
| 374 |
+
2,sister city,0.417255,0.171515,0.082745,0.014242,129,98.0
|
| 375 |
+
2,should be sent,0.423922,0.187273,0.120784,0.014242,130,98.0
|
| 376 |
+
2,of this action,0.435686,0.203636,0.110588,0.014242,131,97.0
|
| 377 |
+
2,remove the,0.424314,0.220303,0.094902,0.012727,132,98.0
|
| 378 |
+
2,recommend,0.398824,0.237273,0.098039,0.012424,133,99.0
|
| 379 |
+
2,wane over the,0.420784,0.252424,0.12,0.013939,134,98.0
|
| 380 |
+
2,members,0.452941,0.267879,0.077255,0.015758,135,99.0
|
| 381 |
+
2,partnership,0.438039,0.32303,0.087451,0.014848,136,98.0
|
| 382 |
+
2,formal documents,0.418824,0.339394,0.153333,0.014242,137,98.0
|
| 383 |
+
2,U.S. community,0.428627,0.355758,0.12,0.014848,138,99.0
|
| 384 |
+
2,a city council,0.452941,0.371515,0.108627,0.015152,139,99.0
|
| 385 |
+
2,communities share,0.402745,0.388485,0.137647,0.013636,140,98.0
|
| 386 |
+
2,support,0.461176,0.405152,0.068235,0.013333,141,98.0
|
| 387 |
+
2,program to,0.430196,0.420303,0.09451,0.015152,142,98.0
|
| 388 |
+
2,"a ""Sister",0.445882,0.459697,0.100392,0.014848,143,99.0
|
| 389 |
+
2,examples,0.468235,0.476364,0.081176,0.014848,144,99.0
|
| 390 |
+
2,your agreement:,0.446275,0.516667,0.134902,0.014545,145,99.0
|
| 391 |
+
2,"the ceremonial,",0.462353,0.541515,0.107451,0.013636,146,98.0
|
| 392 |
+
2,"understanding,",0.376471,0.557273,0.125098,0.014242,147,98.0
|
| 393 |
+
2,specific,0.435686,0.573939,0.067843,0.013333,148,99.0
|
| 394 |
+
2,of,0.465098,0.590303,0.037255,0.013333,149,99.0
|
| 395 |
+
2,you plan to do.,0.421176,0.619394,0.121569,0.013939,150,99.0
|
| 396 |
+
2,are good,0.485098,0.636061,0.067451,0.013333,151,99.0
|
| 397 |
+
2,to do if it,0.44902,0.651818,0.078431,0.013939,152,99.0
|
| 398 |
+
2,formal document,0.42549,0.668788,0.139216,0.013333,153,98.0
|
| 399 |
+
2,nuts-and-bolts,0.428627,0.684848,0.125882,0.01303,154,99.0
|
| 400 |
+
2,can be,0.441569,0.700909,0.056471,0.013636,155,98.0
|
| 401 |
+
2,sister,0.480392,0.71697,0.052157,0.013636,156,99.0
|
| 402 |
+
2,document and should,0.41098,0.73303,0.149412,0.013939,157,98.0
|
| 403 |
+
2,Remember that,0.439608,0.778485,0.108235,0.013636,158,99.0
|
| 404 |
+
2,with your,0.461569,0.795152,0.05098,0.013333,159,99.0
|
| 405 |
+
2,agreement. Be,0.405882,0.811212,0.122745,0.013939,160,99.0
|
| 406 |
+
2,the agreement,0.45451,0.840606,0.116471,0.013636,161,98.0
|
| 407 |
+
2,your partner,0.437647,0.85697,0.09098,0.013333,162,99.0
|
| 408 |
+
2,made. Have,0.423529,0.87303,0.098039,0.014242,163,99.0
|
| 409 |
+
2,will reflect,0.529804,0.130909,0.089412,0.014242,164,98.0
|
| 410 |
+
2,"relationship, then",0.5,0.171515,0.141176,0.014242,165,98.0
|
| 411 |
+
2,to the mayor,0.544706,0.187273,0.106667,0.014242,166,98.0
|
| 412 |
+
2,in writing,0.546275,0.203636,0.076863,0.014242,167,97.0
|
| 413 |
+
2,partnership,0.519216,0.220303,0.096863,0.012727,168,98.0
|
| 414 |
+
2,terminating a,0.496863,0.237273,0.109804,0.012424,169,99.0
|
| 415 |
+
2,"years, and",0.540784,0.252424,0.09098,0.013939,170,98.0
|
| 416 |
+
2,years after,0.530196,0.267879,0.100392,0.015758,171,99.0
|
| 417 |
+
2,to be recognized,0.52549,0.32303,0.12902,0.014848,172,98.0
|
| 418 |
+
2,which,0.572157,0.339394,0.050196,0.014242,173,98.0
|
| 419 |
+
2,is already,0.548627,0.355758,0.085882,0.014848,174,99.0
|
| 420 |
+
2,resolution,0.561569,0.371515,0.08549,0.015152,175,99.0
|
| 421 |
+
2,a mutual,0.540392,0.388485,0.067451,0.013636,176,98.0
|
| 422 |
+
2,structure to,0.529412,0.405152,0.109804,0.013333,177,98.0
|
| 423 |
+
2,see if they,0.524706,0.420303,0.09098,0.015152,178,98.0
|
| 424 |
+
2,"City Agreement""",0.546275,0.459697,0.110196,0.014848,179,99.0
|
| 425 |
+
2,"show, the",0.549412,0.476364,0.081569,0.014848,180,99.0
|
| 426 |
+
2,with,0.569804,0.541515,0.035686,0.013636,181,98.0
|
| 427 |
+
2,"cooperation, and",0.501569,0.557273,0.14,0.014242,182,98.0
|
| 428 |
+
2,"programs/activities,",0.503529,0.573939,0.162745,0.013333,183,99.0
|
| 429 |
+
2,exchanges to,0.502353,0.590303,0.114118,0.013333,184,99.0
|
| 430 |
+
2,Some,0.542745,0.619394,0.052157,0.013939,185,99.0
|
| 431 |
+
2,to include.,0.552549,0.636061,0.076078,0.013333,186,99.0
|
| 432 |
+
2,makes the,0.527451,0.651818,0.087451,0.013939,187,99.0
|
| 433 |
+
2,to establish,0.564706,0.668788,0.096471,0.013333,188,98.0
|
| 434 |
+
2,text related,0.55451,0.684848,0.107059,0.01303,189,99.0
|
| 435 |
+
2,expressed more,0.498039,0.700909,0.134902,0.013636,190,98.0
|
| 436 |
+
2,city committees.,0.532549,0.71697,0.132941,0.013636,191,99.0
|
| 437 |
+
2,not be,0.560392,0.73303,0.055294,0.013939,192,98.0
|
| 438 |
+
2,this is signed,0.547843,0.778485,0.11098,0.013636,193,99.0
|
| 439 |
+
2,international,0.512549,0.795152,0.101569,0.013333,194,99.0
|
| 440 |
+
2,flexible to,0.528627,0.811212,0.082353,0.013939,195,99.0
|
| 441 |
+
2,if it is,0.57098,0.840606,0.049412,0.013636,196,98.0
|
| 442 |
+
2,community to,0.528627,0.85697,0.095294,0.013333,197,99.0
|
| 443 |
+
2,someone in,0.521569,0.87303,0.098039,0.014242,198,99.0
|
| 444 |
+
2,this status in,0.619216,0.130909,0.106275,0.014242,199,98.0
|
| 445 |
+
2,a letter from,0.641176,0.171515,0.107059,0.014242,200,98.0
|
| 446 |
+
2,of the,0.651373,0.187273,0.051373,0.014242,201,98.0
|
| 447 |
+
2,by the mayor,0.623137,0.203636,0.109804,0.014242,202,97.0
|
| 448 |
+
2,from its,0.616078,0.220303,0.068627,0.012727,203,98.0
|
| 449 |
+
2,relationship,0.606667,0.237273,0.1,0.012424,204,99.0
|
| 450 |
+
2,in many cases,0.631765,0.252424,0.118824,0.013939,205,98.0
|
| 451 |
+
2,it has been,0.630588,0.267879,0.089804,0.015758,206,99.0
|
| 452 |
+
2,by Sister,0.65451,0.32303,0.073725,0.014848,207,98.0
|
| 453 |
+
2,clearly,0.622353,0.339394,0.061961,0.014242,208,98.0
|
| 454 |
+
2,a member,0.63451,0.355758,0.084314,0.014848,209,99.0
|
| 455 |
+
2,declaring,0.647059,0.371515,0.08,0.015152,210,99.0
|
| 456 |
+
2,commitment to,0.607843,0.388485,0.118824,0.013636,211,98.0
|
| 457 |
+
2,build a lasting,0.639216,0.405152,0.105882,0.013333,212,98.0
|
| 458 |
+
2,have any,0.615686,0.420303,0.076471,0.015152,213,98.0
|
| 459 |
+
2,or,0.656471,0.459697,0.053333,0.014848,214,99.0
|
| 460 |
+
2,actual name,0.63098,0.476364,0.102745,0.014848,215,99.0
|
| 461 |
+
2,language focusing,0.60549,0.541515,0.152549,0.013636,216,98.0
|
| 462 |
+
2,mutual,0.641569,0.557273,0.057647,0.014242,217,98.0
|
| 463 |
+
2,or more,0.666275,0.573939,0.067843,0.013333,218,99.0
|
| 464 |
+
2,economic development.,0.616471,0.590303,0.158039,0.013333,219,99.0
|
| 465 |
+
2,"specifics, like",0.594902,0.619394,0.112941,0.013939,220,99.0
|
| 466 |
+
2,"However,",0.628627,0.636061,0.078431,0.013333,221,99.0
|
| 467 |
+
2,document too,0.614902,0.651818,0.112941,0.013939,222,99.0
|
| 468 |
+
2,the,0.661176,0.668788,0.030588,0.013333,223,98.0
|
| 469 |
+
2,to,0.661569,0.684848,0.012941,0.01303,224,99.0
|
| 470 |
+
2,fully in a,0.632941,0.700909,0.072549,0.013636,225,98.0
|
| 471 |
+
2,Your,0.66549,0.71697,0.044314,0.013636,226,99.0
|
| 472 |
+
2,dated or limited,0.615686,0.73303,0.130588,0.013939,227,98.0
|
| 473 |
+
2,by both,0.658824,0.778485,0.065098,0.013636,228,99.0
|
| 474 |
+
2,partners and,0.614118,0.795152,0.107843,0.013333,229,99.0
|
| 475 |
+
2,cultural or,0.61098,0.811212,0.089412,0.013939,230,99.0
|
| 476 |
+
2,drafted in,0.620392,0.840606,0.083529,0.013636,231,98.0
|
| 477 |
+
2,be able to read,0.623922,0.85697,0.123529,0.013333,232,99.0
|
| 478 |
+
2,your own,0.619608,0.87303,0.082745,0.014242,233,99.0
|
| 479 |
+
2,directories,0.72549,0.130909,0.094118,0.014242,234,98.0
|
| 480 |
+
2,the mayor,0.748235,0.171515,0.084706,0.014242,235,98.0
|
| 481 |
+
2,sister city. Sister,0.702745,0.187273,0.137647,0.014242,236,98.0
|
| 482 |
+
2,of the U.S.,0.732941,0.203636,0.08549,0.014242,237,97.0
|
| 483 |
+
2,directories and all,0.684706,0.220303,0.147843,0.012727,238,98.0
|
| 484 |
+
2,simply because,0.706667,0.237273,0.127059,0.012424,239,99.0
|
| 485 |
+
2,a dormant,0.750588,0.252424,0.081961,0.013939,240,98.0
|
| 486 |
+
2,inactive.,0.720392,0.267879,0.063529,0.015758,241,99.0
|
| 487 |
+
2,Cities International,0.728235,0.32303,0.146667,0.014848,242,98.0
|
| 488 |
+
2,endorse the link.,0.684314,0.339394,0.139608,0.014242,243,98.0
|
| 489 |
+
2,of SCl and has,0.718824,0.355758,0.118039,0.014848,244,99.0
|
| 490 |
+
2,the intent to,0.727059,0.371515,0.098824,0.015152,245,99.0
|
| 491 |
+
2,the relationship;,0.726667,0.388485,0.132941,0.013636,246,98.0
|
| 492 |
+
2,relationship.,0.745098,0.405152,0.101961,0.013333,247,98.0
|
| 493 |
+
2,additional requirements,0.692157,0.420303,0.187059,0.015152,248,98.0
|
| 494 |
+
2,"""Memorandum of",0.709804,0.459697,0.106667,0.014848,249,99.0
|
| 495 |
+
2,and format,0.733725,0.476364,0.096471,0.014848,250,99.0
|
| 496 |
+
2,on each,0.758039,0.541515,0.067843,0.013636,251,98.0
|
| 497 |
+
2,benefit to the,0.699216,0.557273,0.113725,0.014242,252,98.0
|
| 498 |
+
2,concrete,0.734118,0.573939,0.075294,0.013333,253,99.0
|
| 499 |
+
2,particular areas,0.707843,0.619394,0.131373,0.013939,254,99.0
|
| 500 |
+
2,there's no need,0.707059,0.636061,0.130196,0.013333,255,99.0
|
| 501 |
+
2,lengthy or,0.727843,0.651818,0.088627,0.013939,256,99.0
|
| 502 |
+
2,relationship;,0.691765,0.668788,0.104706,0.013333,257,98.0
|
| 503 |
+
2,implementation or,0.67451,0.684848,0.137647,0.01303,258,99.0
|
| 504 |
+
2,separate,0.70549,0.700909,0.071373,0.013636,259,98.0
|
| 505 |
+
2,partnership,0.709804,0.71697,0.095294,0.013636,260,99.0
|
| 506 |
+
2,by being,0.746275,0.73303,0.072941,0.013939,261,98.0
|
| 507 |
+
2,cities. You,0.723922,0.778485,0.086275,0.013636,262,99.0
|
| 508 |
+
2,solicit feedback,0.721961,0.795152,0.128627,0.013333,263,99.0
|
| 509 |
+
2,municipal priorities.,0.700392,0.811212,0.156078,0.013939,264,99.0
|
| 510 |
+
2,English. It is,0.703922,0.840606,0.095686,0.013636,265,98.0
|
| 511 |
+
2,and understand,0.747451,0.85697,0.124314,0.013333,266,99.0
|
| 512 |
+
2,community who,0.702353,0.87303,0.124314,0.014242,267,99.0
|
| 513 |
+
2,and all,0.819608,0.130909,0.050588,0.014242,268,98.0
|
| 514 |
+
2,or,0.832941,0.171515,0.018824,0.014242,269,98.0
|
| 515 |
+
2,city,0.818431,0.203636,0.028627,0.014242,270,97.0
|
| 516 |
+
2,lists,0.832549,0.220303,0.031765,0.012727,271,98.0
|
| 517 |
+
2,it is,0.833725,0.237273,0.027843,0.012424,272,99.0
|
| 518 |
+
2,This,0.823922,0.339394,0.034118,0.014242,273,98.0
|
| 519 |
+
2,twin,0.825882,0.371515,0.030588,0.015152,274,99.0
|
| 520 |
+
2,You,0.847059,0.405152,0.029412,0.013333,275,98.0
|
| 521 |
+
2,of,0.830196,0.476364,0.019608,0.014848,276,99.0
|
| 522 |
+
2,city's,0.825882,0.541515,0.041176,0.013636,277,98.0
|
| 523 |
+
2,"precise,",0.812941,0.557273,0.06549,0.014242,278,98.0
|
| 524 |
+
2,goals,0.809412,0.573939,0.040784,0.013333,279,99.0
|
| 525 |
+
2,of,0.839216,0.619394,0.019216,0.013939,280,99.0
|
| 526 |
+
2,to,0.837255,0.636061,0.015294,0.013333,281,99.0
|
| 527 |
+
2,limits,0.816471,0.651818,0.04,0.013939,282,99.0
|
| 528 |
+
2,specific,0.796471,0.668788,0.064706,0.013333,283,98.0
|
| 529 |
+
2,aligned,0.819216,0.73303,0.057255,0.013939,284,98.0
|
| 530 |
+
2,should,0.810196,0.778485,0.052941,0.013636,285,99.0
|
| 531 |
+
2,on,0.850588,0.795152,0.018431,0.013333,286,99.0
|
| 532 |
+
2,Toolkit,0.82902,0.069394,0.132941,0.031515,287,99.0
|
| 533 |
+
3,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,99.0
|
| 534 |
+
3,akaplan@sister-cities.org,0.442745,0.781818,0.206275,0.014848,2,98.0
|
| 535 |
+
3,INTERN,0.17098,0.066667,0.110196,0.009091,3,93.0
|
| 536 |
+
3,Connect,0.168627,0.085455,0.067451,0.014848,4,99.0
|
| 537 |
+
3,speaks,0.176078,0.131818,0.061569,0.013636,5,97.0
|
| 538 |
+
3,you have,0.175294,0.148788,0.078431,0.013333,6,97.0
|
| 539 |
+
3,Keep it to,0.175686,0.17697,0.080392,0.014242,7,99.0
|
| 540 |
+
3,work best,0.174902,0.193636,0.083529,0.014242,8,99.0
|
| 541 |
+
3,Most sister,0.175294,0.222727,0.096078,0.013939,9,99.0
|
| 542 |
+
3,of the,0.174902,0.239091,0.054902,0.014242,10,99.0
|
| 543 |
+
3,and,0.176078,0.256364,0.03451,0.012727,11,97.0
|
| 544 |
+
3,Consider,0.175686,0.284545,0.075686,0.014242,12,98.0
|
| 545 |
+
3,logos to,0.176078,0.301212,0.066667,0.013939,13,99.0
|
| 546 |
+
3,city hall or,0.175294,0.317576,0.087451,0.013939,14,99.0
|
| 547 |
+
3,Look at,0.17451,0.346364,0.064706,0.014848,15,98.0
|
| 548 |
+
3,of what is,0.174902,0.362727,0.083137,0.014545,16,99.0
|
| 549 |
+
3,cannot,0.174118,0.378485,0.058824,0.015758,17,99.0
|
| 550 |
+
3,have them,0.174902,0.395758,0.086275,0.014242,18,98.0
|
| 551 |
+
3,Documents,0.175294,0.424848,0.093725,0.014545,19,98.0
|
| 552 |
+
3,Check with,0.174902,0.453939,0.091373,0.015455,20,98.0
|
| 553 |
+
3,is OK with,0.17451,0.470606,0.083137,0.014545,21,98.0
|
| 554 |
+
3,don't want,0.175686,0.487576,0.087059,0.013636,22,98.0
|
| 555 |
+
3,Official,0.175294,0.516364,0.064706,0.014242,23,98.0
|
| 556 |
+
3,partnership.,0.175294,0.533333,0.103922,0.013333,24,99.0
|
| 557 |
+
3,for their,0.174902,0.549394,0.069804,0.012424,25,99.0
|
| 558 |
+
3,Remember,0.175294,0.578485,0.089804,0.013636,26,98.0
|
| 559 |
+
3,receive,0.175686,0.595152,0.065882,0.013333,27,99.0
|
| 560 |
+
3,it is included,0.17451,0.610303,0.104314,0.015455,28,98.0
|
| 561 |
+
3,Remember that,0.116863,0.639091,0.125882,0.015152,29,99.0
|
| 562 |
+
3,like the establishment,0.116863,0.655152,0.178431,0.015455,30,98.0
|
| 563 |
+
3,"others, before",0.116863,0.672424,0.122745,0.013939,31,99.0
|
| 564 |
+
3,office to see if,0.117255,0.689091,0.146667,0.012727,32,99.0
|
| 565 |
+
3,On the following,0.116471,0.716364,0.131373,0.015152,33,99.0
|
| 566 |
+
3,what is possible.,0.116471,0.733636,0.13451,0.013939,34,99.0
|
| 567 |
+
3,encourage you to,0.116471,0.749394,0.145098,0.014545,35,98.0
|
| 568 |
+
3,you are unsure,0.115686,0.766061,0.129412,0.013939,36,99.0
|
| 569 |
+
3,sending it to our,0.115686,0.781818,0.132941,0.014848,37,98.0
|
| 570 |
+
3,347-8630.,0.116078,0.798182,0.083529,0.012727,38,99.0
|
| 571 |
+
3,ATION,0.281176,0.066667,0.093333,0.009091,39,93.0
|
| 572 |
+
3,globally. Thrive,0.236078,0.085455,0.121569,0.014848,40,99.0
|
| 573 |
+
3,that language,0.237647,0.131818,0.110196,0.013636,41,97.0
|
| 574 |
+
3,in your own,0.253725,0.148788,0.098824,0.013333,42,97.0
|
| 575 |
+
3,one page.,0.256078,0.17697,0.081961,0.014242,43,99.0
|
| 576 |
+
3,if they can,0.258431,0.193636,0.090196,0.014242,44,99.0
|
| 577 |
+
3,city,0.271373,0.222727,0.034118,0.013939,45,99.0
|
| 578 |
+
3,sister city,0.229804,0.239091,0.088235,0.014242,46,99.0
|
| 579 |
+
3,cooperation.,0.210588,0.256364,0.101569,0.012727,47,97.0
|
| 580 |
+
3,using official,0.251373,0.284545,0.109804,0.014242,48,98.0
|
| 581 |
+
3,reflect your,0.242745,0.301212,0.112157,0.013939,49,99.0
|
| 582 |
+
3,other municipal,0.262745,0.317576,0.123922,0.013939,50,99.0
|
| 583 |
+
3,other agreements,0.239216,0.346364,0.150196,0.014848,51,98.0
|
| 584 |
+
3,acceptable,0.258039,0.362727,0.09098,0.014545,52,99.0
|
| 585 |
+
3,access older,0.232941,0.378485,0.109804,0.015758,53,99.0
|
| 586 |
+
3,"on file,",0.261176,0.395758,0.059608,0.014242,54,98.0
|
| 587 |
+
3,must be,0.26902,0.424848,0.069804,0.014545,55,98.0
|
| 588 |
+
3,"your mayor,",0.266275,0.453939,0.104314,0.015455,56,98.0
|
| 589 |
+
3,them. The,0.257647,0.470606,0.085882,0.014545,57,98.0
|
| 590 |
+
3,to spend,0.262745,0.487576,0.076078,0.013636,58,98.0
|
| 591 |
+
3,documents are,0.24,0.516364,0.122745,0.014242,59,98.0
|
| 592 |
+
3,Be sure,0.279216,0.533333,0.068627,0.013333,60,99.0
|
| 593 |
+
3,records.,0.244706,0.549394,0.063922,0.012424,61,99.0
|
| 594 |
+
3,to send your,0.265098,0.578485,0.10549,0.013636,62,98.0
|
| 595 |
+
3,your agreement,0.241569,0.595152,0.133725,0.013333,63,99.0
|
| 596 |
+
3,in our,0.278824,0.610303,0.052941,0.015455,64,98.0
|
| 597 |
+
3,each city's,0.242745,0.639091,0.090588,0.015152,65,99.0
|
| 598 |
+
3,of a,0.295294,0.655152,0.039216,0.015455,66,98.0
|
| 599 |
+
3,sanctioning a,0.239608,0.672424,0.10902,0.013939,67,99.0
|
| 600 |
+
3,this is the case.,0.263922,0.689091,0.094902,0.012727,68,99.0
|
| 601 |
+
3,pages you'll,0.247843,0.716364,0.101569,0.015152,69,99.0
|
| 602 |
+
3,While you,0.25098,0.733636,0.083529,0.013939,70,99.0
|
| 603 |
+
3,make your,0.261569,0.749394,0.089412,0.014545,71,98.0
|
| 604 |
+
3,about your,0.245098,0.766061,0.091765,0.013939,72,99.0
|
| 605 |
+
3,Membership,0.248627,0.781818,0.104706,0.014848,73,98.0
|
| 606 |
+
3,AL,0.37451,0.066667,0.034118,0.009091,74,93.0
|
| 607 |
+
3,locally.,0.357647,0.085455,0.051373,0.014848,75,99.0
|
| 608 |
+
3,check the,0.347843,0.131818,0.083529,0.013636,76,97.0
|
| 609 |
+
3,agreement.,0.352549,0.148788,0.090196,0.013333,77,97.0
|
| 610 |
+
3,Ceremonial,0.338039,0.17697,0.096863,0.014242,78,99.0
|
| 611 |
+
3,be posted in,0.348627,0.193636,0.1,0.014242,79,99.0
|
| 612 |
+
3,agreements include,0.30549,0.222727,0.162353,0.013939,80,99.0
|
| 613 |
+
3,movement-to,0.318039,0.239091,0.11098,0.014242,81,99.0
|
| 614 |
+
3,letterhead,0.361176,0.284545,0.086275,0.014242,82,98.0
|
| 615 |
+
3,enhance the,0.354902,0.301212,0.094118,0.013939,83,99.0
|
| 616 |
+
3,offices,0.386667,0.317576,0.063922,0.013939,84,99.0
|
| 617 |
+
3,your city,0.389412,0.346364,0.077647,0.014848,85,98.0
|
| 618 |
+
3,"or possible,",0.34902,0.362727,0.096863,0.014545,86,99.0
|
| 619 |
+
3,agreements,0.342745,0.378485,0.100392,0.015758,87,99.0
|
| 620 |
+
3,although we do,0.320784,0.395758,0.126667,0.014242,88,98.0
|
| 621 |
+
3,signed by the,0.338824,0.424848,0.113333,0.014545,89,98.0
|
| 622 |
+
3,"city council,",0.370588,0.453939,0.099608,0.015455,90,98.0
|
| 623 |
+
3,mayor is the,0.343529,0.470606,0.103529,0.014545,91,98.0
|
| 624 |
+
3,time developing,0.338824,0.487576,0.131373,0.013636,92,98.0
|
| 625 |
+
3,usually signed,0.362745,0.516364,0.116471,0.014242,93,98.0
|
| 626 |
+
3,both communities,0.347843,0.533333,0.14549,0.013333,94,99.0
|
| 627 |
+
3,signed,0.370588,0.578485,0.057647,0.013636,95,98.0
|
| 628 |
+
3,we will post,0.375294,0.595152,0.092941,0.013333,96,99.0
|
| 629 |
+
3,Annual Membership,0.331765,0.610303,0.161569,0.015455,97,98.0
|
| 630 |
+
3,sister city,0.333333,0.639091,0.087059,0.015152,98,99.0
|
| 631 |
+
3,"committee, a",0.33451,0.655152,0.109804,0.015455,99,98.0
|
| 632 |
+
3,sister city,0.348627,0.672424,0.106275,0.013939,100,99.0
|
| 633 |
+
3,find a series,0.349412,0.716364,0.102353,0.015152,101,99.0
|
| 634 |
+
3,should feel free,0.33451,0.733636,0.13451,0.013939,102,99.0
|
| 635 |
+
3,agreement,0.35098,0.749394,0.090196,0.014545,103,98.0
|
| 636 |
+
3,agreement or,0.336863,0.766061,0.135686,0.013939,104,99.0
|
| 637 |
+
3,Director at,0.353333,0.781818,0.089412,0.014848,105,98.0
|
| 638 |
+
3,foreign-language,0.431373,0.131818,0.142745,0.013636,106,97.0
|
| 639 |
+
3,documents such,0.434902,0.17697,0.138824,0.014242,107,99.0
|
| 640 |
+
3,their entirety.,0.448627,0.193636,0.108235,0.014242,108,99.0
|
| 641 |
+
3,some,0.467843,0.222727,0.046667,0.013939,109,99.0
|
| 642 |
+
3,promote peace,0.42902,0.239091,0.126667,0.014242,110,99.0
|
| 643 |
+
3,and/or other,0.447451,0.284545,0.11451,0.014242,111,98.0
|
| 644 |
+
3,document.,0.44902,0.301212,0.083137,0.013939,112,99.0
|
| 645 |
+
3,and should,0.450588,0.317576,0.088627,0.013939,113,99.0
|
| 646 |
+
3,has signed.,0.467059,0.346364,0.094118,0.014848,114,98.0
|
| 647 |
+
3,and they may,0.445882,0.362727,0.113725,0.014545,115,99.0
|
| 648 |
+
3,please contact,0.443137,0.378485,0.121569,0.015758,116,99.0
|
| 649 |
+
3,not have copies,0.447451,0.395758,0.132157,0.014242,117,98.0
|
| 650 |
+
3,top elected,0.452157,0.424848,0.094902,0.014545,118,98.0
|
| 651 |
+
3,"town clerk,",0.470196,0.453939,0.093333,0.015455,119,98.0
|
| 652 |
+
3,one putting,0.447059,0.470606,0.095686,0.014545,120,98.0
|
| 653 |
+
3,an agreement,0.470196,0.487576,0.119608,0.013636,121,98.0
|
| 654 |
+
3,during a,0.479216,0.516364,0.070588,0.014242,122,98.0
|
| 655 |
+
3,receive,0.493333,0.533333,0.062745,0.013333,123,99.0
|
| 656 |
+
3,agreement to,0.428235,0.578485,0.112549,0.013636,124,98.0
|
| 657 |
+
3,the relationship,0.468235,0.595152,0.128627,0.013333,125,99.0
|
| 658 |
+
3,Directory.,0.493333,0.610303,0.081961,0.015455,126,98.0
|
| 659 |
+
3,program is,0.420392,0.639091,0.09451,0.015152,127,99.0
|
| 660 |
+
3,"review period,",0.438039,0.655152,0.120392,0.015455,128,98.0
|
| 661 |
+
3,agreement. Check,0.454902,0.672424,0.12549,0.013939,129,99.0
|
| 662 |
+
3,of partnership,0.451765,0.716364,0.124706,0.015152,130,99.0
|
| 663 |
+
3,to use some,0.46902,0.733636,0.101569,0.013939,131,99.0
|
| 664 |
+
3,your own and,0.441176,0.749394,0.112549,0.014545,132,98.0
|
| 665 |
+
3,want advice,0.472549,0.766061,0.076471,0.013939,133,99.0
|
| 666 |
+
3,version to,0.574118,0.131818,0.08549,0.013636,134,97.0
|
| 667 |
+
3,as these,0.573725,0.17697,0.073333,0.014242,135,99.0
|
| 668 |
+
3,acknowledgement,0.51451,0.222727,0.148627,0.013939,136,99.0
|
| 669 |
+
3,through mutual,0.555686,0.239091,0.126667,0.014242,137,99.0
|
| 670 |
+
3,embellishments,0.561961,0.284545,0.117255,0.014242,138,98.0
|
| 671 |
+
3,Sister city,0.532157,0.301212,0.086667,0.013939,139,99.0
|
| 672 |
+
3,reflect their,0.539216,0.317576,0.103137,0.013939,140,99.0
|
| 673 |
+
3,These,0.561176,0.346364,0.05451,0.014848,141,98.0
|
| 674 |
+
3,be in an easily,0.559608,0.362727,0.118824,0.014545,142,99.0
|
| 675 |
+
3,Sister Cities,0.564706,0.378485,0.102353,0.015758,143,99.0
|
| 676 |
+
3,of all,0.579608,0.395758,0.043137,0.014242,144,98.0
|
| 677 |
+
3,official of,0.547059,0.424848,0.119608,0.014545,145,98.0
|
| 678 |
+
3,et al. to make,0.563529,0.453939,0.111373,0.015455,146,98.0
|
| 679 |
+
3,his or her name,0.542745,0.470606,0.13451,0.014545,147,98.0
|
| 680 |
+
3,which will,0.589804,0.487576,0.08,0.013636,148,98.0
|
| 681 |
+
3,formal ceremony,0.549804,0.516364,0.142745,0.014242,149,98.0
|
| 682 |
+
3,a signed set,0.556078,0.533333,0.103137,0.013333,150,99.0
|
| 683 |
+
3,Sister Cities,0.540784,0.578485,0.100784,0.013636,151,98.0
|
| 684 |
+
3,in the,0.596863,0.595152,0.048235,0.013333,152,99.0
|
| 685 |
+
3,independent and,0.514902,0.639091,0.136078,0.015152,153,99.0
|
| 686 |
+
3,sustainability/funding,0.558431,0.655152,0.183529,0.015455,154,98.0
|
| 687 |
+
3,with your,0.580392,0.672424,0.078431,0.013939,155,99.0
|
| 688 |
+
3,agreements,0.576471,0.716364,0.1,0.015152,156,99.0
|
| 689 |
+
3,of the,0.570588,0.733636,0.05451,0.013939,157,99.0
|
| 690 |
+
3,be creative,0.553725,0.749394,0.096471,0.014545,158,98.0
|
| 691 |
+
3,you can always,0.54902,0.766061,0.12549,0.013939,159,99.0
|
| 692 |
+
3,make sure it,0.659608,0.131818,0.102353,0.013636,160,97.0
|
| 693 |
+
3,partnership,0.647059,0.17697,0.099608,0.014242,161,99.0
|
| 694 |
+
3,of the founding,0.663137,0.222727,0.128627,0.013939,162,99.0
|
| 695 |
+
3,"respect,",0.682353,0.239091,0.072549,0.014242,163,99.0
|
| 696 |
+
3,such as,0.679216,0.284545,0.066275,0.014242,164,98.0
|
| 697 |
+
3,agreements are,0.618824,0.301212,0.132941,0.013939,165,99.0
|
| 698 |
+
3,historical,0.642353,0.317576,0.078039,0.013939,166,99.0
|
| 699 |
+
3,agreements may,0.615686,0.346364,0.137647,0.014848,167,98.0
|
| 700 |
+
3,replicable,0.678431,0.362727,0.082745,0.014545,168,99.0
|
| 701 |
+
3,"International,",0.667059,0.378485,0.114902,0.015758,169,99.0
|
| 702 |
+
3,partnership,0.622745,0.395758,0.101961,0.014242,170,98.0
|
| 703 |
+
3,both communities.,0.666667,0.424848,0.112941,0.014545,171,98.0
|
| 704 |
+
3,sure that,0.674902,0.453939,0.079608,0.015455,172,98.0
|
| 705 |
+
3,on the,0.677255,0.470606,0.055294,0.014545,173,98.0
|
| 706 |
+
3,never be,0.669804,0.487576,0.080392,0.013636,174,98.0
|
| 707 |
+
3,recognizing,0.692549,0.516364,0.098039,0.014242,175,98.0
|
| 708 |
+
3,of the official,0.659216,0.533333,0.116471,0.013333,176,99.0
|
| 709 |
+
3,International.,0.641569,0.578485,0.11098,0.013636,177,98.0
|
| 710 |
+
3,City Directory,0.645098,0.595152,0.116863,0.013333,178,99.0
|
| 711 |
+
3,can impose,0.65098,0.639091,0.09451,0.015152,179,99.0
|
| 712 |
+
3,local program,0.658824,0.672424,0.115294,0.013939,180,99.0
|
| 713 |
+
3,to give you,0.676471,0.716364,0.09098,0.015152,181,99.0
|
| 714 |
+
3,formatting and,0.625098,0.733636,0.125882,0.013939,182,99.0
|
| 715 |
+
3,with what you,0.650196,0.749394,0.111765,0.014545,183,98.0
|
| 716 |
+
3,solicit,0.67451,0.766061,0.05098,0.013939,184,99.0
|
| 717 |
+
3,or contacting,0.64902,0.781818,0.110196,0.014848,185,98.0
|
| 718 |
+
3,mirrors what,0.761961,0.131818,0.104314,0.013636,186,97.0
|
| 719 |
+
3,agreements,0.746667,0.17697,0.096863,0.014242,187,99.0
|
| 720 |
+
3,principles,0.791765,0.222727,0.079608,0.013939,188,99.0
|
| 721 |
+
3,"understanding,",0.754902,0.239091,0.120784,0.014242,189,99.0
|
| 722 |
+
3,city seals or,0.74549,0.284545,0.098431,0.014242,190,98.0
|
| 723 |
+
3,often posted,0.751765,0.301212,0.107059,0.013939,191,99.0
|
| 724 |
+
3,importance,0.720392,0.317576,0.087451,0.013939,192,99.0
|
| 725 |
+
3,give you an,0.753333,0.346364,0.095686,0.014848,193,98.0
|
| 726 |
+
3,format. If,0.761176,0.362727,0.083137,0.014545,194,99.0
|
| 727 |
+
3,we may,0.781961,0.378485,0.057647,0.015758,195,99.0
|
| 728 |
+
3,agreements.,0.724706,0.395758,0.099216,0.014242,196,98.0
|
| 729 |
+
3,the agreement,0.75451,0.453939,0.118039,0.015455,197,98.0
|
| 730 |
+
3,"paper, and you",0.732549,0.470606,0.121961,0.014545,198,98.0
|
| 731 |
+
3,signed.,0.750196,0.487576,0.056863,0.013636,199,98.0
|
| 732 |
+
3,the,0.790588,0.516364,0.024706,0.014242,200,98.0
|
| 733 |
+
3,documents,0.775686,0.533333,0.085882,0.013333,201,99.0
|
| 734 |
+
3,After we,0.752549,0.578485,0.070588,0.013636,202,98.0
|
| 735 |
+
3,and make,0.761961,0.595152,0.08,0.013333,203,99.0
|
| 736 |
+
3,requirements,0.74549,0.639091,0.109804,0.015152,204,99.0
|
| 737 |
+
3,"plan, among",0.741961,0.655152,0.092941,0.015455,205,98.0
|
| 738 |
+
3,or mayor's,0.774118,0.672424,0.086667,0.013939,206,99.0
|
| 739 |
+
3,an idea of,0.767451,0.716364,0.080392,0.015152,207,99.0
|
| 740 |
+
3,"language, we",0.75098,0.733636,0.101176,0.013939,208,99.0
|
| 741 |
+
3,produce.,0.761961,0.749394,0.094118,0.014545,209,98.0
|
| 742 |
+
3,feedback by,0.72549,0.766061,0.100392,0.013939,210,99.0
|
| 743 |
+
3,us at (202),0.759216,0.781818,0.089412,0.014848,211,98.0
|
| 744 |
+
3,Toolkit,0.82902,0.069394,0.132941,0.031515,212,99.0
|
| 745 |
+
3,at,0.858824,0.301212,0.014902,0.013939,213,99.0
|
| 746 |
+
3,idea,0.84902,0.346364,0.032941,0.014848,214,98.0
|
| 747 |
+
3,you,0.844314,0.362727,0.028627,0.014545,215,99.0
|
| 748 |
+
3,sure,0.841961,0.595152,0.038039,0.013333,216,99.0
|
| 749 |
+
3,If,0.856078,0.749394,0.007451,0.014545,217,98.0
|
| 750 |
+
4,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,98.0
|
| 751 |
+
4,ESTABLISHMENT,0.549412,0.371818,0.207451,0.015758,2,98.0
|
| 752 |
+
4,IN TERN ATION,0.17098,0.066667,0.198824,0.009091,3,91.0
|
| 753 |
+
4,Connect globally. Thrive,0.167451,0.084848,0.191373,0.016364,4,99.0
|
| 754 |
+
4,AN,0.26,0.371818,0.037255,0.015758,5,98.0
|
| 755 |
+
4,The Sister City,0.221176,0.523333,0.114118,0.014242,6,99.0
|
| 756 |
+
4,By the President,0.221569,0.537273,0.123922,0.014545,7,99.0
|
| 757 |
+
4,Friendship and,0.223137,0.552424,0.114902,0.012727,8,99.0
|
| 758 |
+
4,Direct personal,0.222353,0.565758,0.115686,0.013939,9,99.0
|
| 759 |
+
4,In order to,0.222745,0.59303,0.087451,0.012424,10,99.0
|
| 760 |
+
4,Communities Friendship and by,0.222353,0.608485,0.121176,0.025455,11,98.0
|
| 761 |
+
4,Abu Dhabi and,0.222745,0.649091,0.111765,0.010606,12,99.0
|
| 762 |
+
4,"medicine, and",0.222745,0.66303,0.101176,0.011515,13,99.0
|
| 763 |
+
4,hereby I proclaim,0.222745,0.676667,0.116078,0.012727,14,97.0
|
| 764 |
+
4,the date of Houston,0.222745,0.69,0.139216,0.011818,15,98.0
|
| 765 |
+
4,relationship,0.222745,0.704848,0.083137,0.011515,16,99.0
|
| 766 |
+
4,Signed on this,0.222745,0.731818,0.109804,0.011818,17,99.0
|
| 767 |
+
4,"Languages, both",0.222353,0.746061,0.124314,0.012121,18,99.0
|
| 768 |
+
4,Sheikh,0.245882,0.806667,0.064314,0.010909,19,99.0
|
| 769 |
+
4,Chairman,0.247059,0.823636,0.087451,0.010909,20,99.0
|
| 770 |
+
4,A L,0.369804,0.066667,0.038824,0.009091,21,91.0
|
| 771 |
+
4,locally.,0.358824,0.084848,0.051765,0.016364,22,99.0
|
| 772 |
+
4,a,0.377647,0.291515,0.046275,0.018788,23,23.0
|
| 773 |
+
4,ABU DHABI,0.375686,0.315455,0.082353,0.018182,24,99.0
|
| 774 |
+
4,AGREEMENT,0.297255,0.371818,0.147451,0.015758,25,98.0
|
| 775 |
+
4,SISTER,0.337647,0.392424,0.088627,0.013939,26,99.0
|
| 776 |
+
4,THE CITY,0.336078,0.430303,0.116863,0.016667,27,98.0
|
| 777 |
+
4,"HOUSTON,",0.385098,0.469697,0.127451,0.015455,28,96.0
|
| 778 |
+
4,"Program,",0.335294,0.523333,0.072941,0.014242,29,99.0
|
| 779 |
+
4,of the United,0.34549,0.537273,0.097255,0.014545,30,99.0
|
| 780 |
+
4,understanding,0.338039,0.552424,0.106667,0.012727,31,99.0
|
| 781 |
+
4,contact; and,0.338039,0.565758,0.091765,0.013939,32,99.0
|
| 782 |
+
4,"foster those goals,",0.310196,0.59303,0.151765,0.012424,33,99.0
|
| 783 |
+
4,"goodwill, agree",0.341569,0.606061,0.109412,0.013939,34,98.0
|
| 784 |
+
4,"exploring education,",0.343529,0.620303,0.15098,0.013636,35,98.0
|
| 785 |
+
4,"Houston, sharing",0.338039,0.648788,0.128235,0.011818,36,98.0
|
| 786 |
+
4,the desire to promote,0.327843,0.66303,0.153725,0.011212,37,98.0
|
| 787 |
+
4,themselves Sister,0.343922,0.677273,0.123529,0.009697,38,99.0
|
| 788 |
+
4,City effective. Council,0.368627,0.690909,0.092157,0.024242,39,99.0
|
| 789 |
+
4,became,0.311765,0.705758,0.05451,0.009091,40,99.0
|
| 790 |
+
4,26 of October,0.332549,0.731818,0.103137,0.011818,41,98.0
|
| 791 |
+
4,text being equally,0.346667,0.746061,0.132549,0.012121,42,99.0
|
| 792 |
+
4,Mohammed bin,0.318039,0.806667,0.143922,0.01,43,99.0
|
| 793 |
+
4,of Abu Dhabi,0.343137,0.823636,0.122353,0.010303,44,99.0
|
| 794 |
+
4,&Town Planning,0.326275,0.841818,0.152549,0.010606,45,98.0
|
| 795 |
+
4,4,0.502745,0.256364,0.085882,0.024848,46,33.0
|
| 796 |
+
4,Labiig G,0.427059,0.29,0.137647,0.023636,47,66.0
|
| 797 |
+
4,MUNICIPALITY,0.460392,0.316364,0.103137,0.017273,48,99.0
|
| 798 |
+
4,FOR THE,0.444706,0.371818,0.104706,0.015758,49,98.0
|
| 799 |
+
4,CITIES,0.426275,0.392424,0.08902,0.013939,50,99.0
|
| 800 |
+
4,BETWEEN,0.45451,0.412121,0.111373,0.013333,51,99.0
|
| 801 |
+
4,OF ABU,0.452941,0.430303,0.087451,0.016667,52,98.0
|
| 802 |
+
4,AND,0.488627,0.451818,0.049412,0.012727,53,99.0
|
| 803 |
+
4,TEXAS,0.512549,0.469697,0.081569,0.015455,54,96.0
|
| 804 |
+
4,administered by Sister,0.408235,0.523333,0.172157,0.014242,55,99.0
|
| 805 |
+
4,States of America,0.442745,0.537273,0.135294,0.014545,56,99.0
|
| 806 |
+
4,between the United,0.446667,0.552121,0.141176,0.011515,57,99.0
|
| 807 |
+
4,the people of,0.461961,0.59303,0.103922,0.012424,58,99.0
|
| 808 |
+
4,to collaborate,0.45098,0.606061,0.102745,0.01303,59,98.0
|
| 809 |
+
4,economic,0.49451,0.620303,0.074118,0.013636,60,98.0
|
| 810 |
+
4,common,0.481176,0.648788,0.061569,0.010606,61,99.0
|
| 811 |
+
4,mutual,0.487059,0.663333,0.057255,0.009394,62,99.0
|
| 812 |
+
4,resolution Cities,0.466667,0.67697,0.077255,0.024545,63,99.0
|
| 813 |
+
4,"2002, in duplicate",0.435686,0.732121,0.132549,0.010303,64,96.0
|
| 814 |
+
4,authentic.,0.479216,0.746061,0.070196,0.012121,65,99.0
|
| 815 |
+
4,Butti Al,0.468627,0.805455,0.076471,0.011212,66,98.0
|
| 816 |
+
4,Municipality,0.471373,0.823333,0.115686,0.011515,67,99.0
|
| 817 |
+
4,gi,0.560784,0.288788,0.03098,0.023636,68,49.0
|
| 818 |
+
4,& TOWN PLANNING,0.567451,0.316061,0.138431,0.018788,69,79.0
|
| 819 |
+
4,RELATIONSHIP,0.515294,0.392424,0.165098,0.013939,70,99.0
|
| 820 |
+
4,DHABI ( U.,0.540392,0.430303,0.126667,0.016667,71,98.0
|
| 821 |
+
4,( U.S.A),0.594118,0.469697,0.089804,0.015455,72,96.0
|
| 822 |
+
4,Cities,0.580392,0.523333,0.044706,0.014242,73,99.0
|
| 823 |
+
4,in 1956 to,0.578039,0.537273,0.071373,0.014545,74,99.0
|
| 824 |
+
4,States and,0.587843,0.552121,0.081961,0.011515,75,99.0
|
| 825 |
+
4,Abu Dhabi and,0.565882,0.59303,0.088627,0.012424,76,99.0
|
| 826 |
+
4,for the mutual,0.553725,0.606061,0.105882,0.01303,77,98.0
|
| 827 |
+
4,and cultural,0.568627,0.620303,0.093725,0.013636,78,98.0
|
| 828 |
+
4,interest in energy,0.547451,0.648485,0.134118,0.011515,79,98.0
|
| 829 |
+
4,understanding,0.541176,0.662121,0.110196,0.012121,80,98.0
|
| 830 |
+
4,estatblishing the,0.543922,0.689394,0.121961,0.012121,81,99.0
|
| 831 |
+
4,in the Arabic,0.568235,0.732121,0.1,0.010303,82,96.0
|
| 832 |
+
4,Hamed,0.545098,0.805455,0.066667,0.011212,83,98.0
|
| 833 |
+
4,OF,0.756863,0.371818,0.030196,0.015758,84,98.0
|
| 834 |
+
4,A.E),0.667059,0.430303,0.047059,0.016667,85,98.0
|
| 835 |
+
4,"International, was",0.625098,0.523333,0.139216,0.014242,86,99.0
|
| 836 |
+
4,encourage greater,0.649412,0.537273,0.136863,0.014545,87,99.0
|
| 837 |
+
4,other nations,0.669804,0.552121,0.102745,0.011515,88,99.0
|
| 838 |
+
4,"Houston, in a",0.65451,0.59303,0.09451,0.012424,89,99.0
|
| 839 |
+
4,benefit of their,0.659608,0.606061,0.11098,0.01303,90,98.0
|
| 840 |
+
4,opportunities.,0.662353,0.620303,0.101569,0.013636,91,98.0
|
| 841 |
+
4,", technology",0.681569,0.648485,0.09098,0.011515,92,98.0
|
| 842 |
+
4,among our citizens,0.651373,0.662121,0.143137,0.012121,93,98.0
|
| 843 |
+
4,Sister City,0.665882,0.689394,0.075294,0.012121,94,99.0
|
| 844 |
+
4,and English,0.668235,0.732121,0.085098,0.010303,95,96.0
|
| 845 |
+
4,fe,0.700784,0.752424,0.075686,0.045455,96,57.0
|
| 846 |
+
4,Lee,0.728627,0.804545,0.038824,0.012121,97,99.0
|
| 847 |
+
4,Mayor,0.70549,0.821818,0.06,0.013939,98,96.0
|
| 848 |
+
4,initiated,0.764314,0.523333,0.056863,0.014242,99,99.0
|
| 849 |
+
4,through,0.772549,0.552121,0.058039,0.011515,100,99.0
|
| 850 |
+
4,gesture of,0.74902,0.59303,0.077647,0.012424,101,99.0
|
| 851 |
+
4,and,0.772549,0.648485,0.024314,0.011515,102,98.0
|
| 852 |
+
4,do,0.79451,0.662121,0.017647,0.012121,103,98.0
|
| 853 |
+
4,P.Brown,0.768627,0.804848,0.081176,0.012424,104,99.0
|
| 854 |
+
4,ofI Houston,0.76549,0.821818,0.105882,0.013939,105,98.0
|
| 855 |
+
4,Toolkit,0.82902,0.069091,0.133333,0.032424,106,99.0
|
| 856 |
+
5,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,99.0
|
| 857 |
+
5,IN TERNAT,0.170588,0.066667,0.152549,0.009091,2,93.0
|
| 858 |
+
5,Connect globally. Thrive,0.167451,0.084848,0.191373,0.016364,3,99.0
|
| 859 |
+
5,THE,0.267451,0.353939,0.044706,0.014545,4,98.0
|
| 860 |
+
5,The Sister City,0.2,0.402121,0.122745,0.014242,5,99.0
|
| 861 |
+
5,beneficial solutions,0.202353,0.419091,0.154118,0.010303,6,99.0
|
| 862 |
+
5,"Consequently, the",0.202745,0.434848,0.143137,0.011515,7,95.0
|
| 863 |
+
5,important in their,0.201176,0.448788,0.139216,0.014242,8,99.0
|
| 864 |
+
5,Encourage,0.229412,0.480303,0.088627,0.015152,9,99.0
|
| 865 |
+
5,that they can,0.230588,0.49697,0.103529,0.013636,10,99.0
|
| 866 |
+
5,Supporta and,0.230196,0.513333,0.098039,0.015455,11,98.0
|
| 867 |
+
5,community,0.229412,0.527576,0.095294,0.015152,12,99.0
|
| 868 |
+
5,medium for,0.229804,0.543636,0.09451,0.012727,13,99.0
|
| 869 |
+
5,Generate an,0.23098,0.562121,0.094902,0.010606,14,99.0
|
| 870 |
+
5,"by serving themes, as",0.22902,0.576667,0.132157,0.028182,15,99.0
|
| 871 |
+
5,Identify,0.229804,0.593333,0.06549,0.012727,16,99.0
|
| 872 |
+
5,nurture the i,0.228627,0.609091,0.096471,0.011818,17,92.0
|
| 873 |
+
5,Promote the cities; key,0.228235,0.623333,0.106667,0.031515,18,98.0
|
| 874 |
+
5,Provide financial,0.228627,0.655455,0.137647,0.014242,19,98.0
|
| 875 |
+
5,aims of the,0.228235,0.670303,0.099216,0.015758,20,99.0
|
| 876 |
+
5,With the above,0.198824,0.700303,0.122353,0.016364,21,98.0
|
| 877 |
+
5,London soleinnly,0.198431,0.716667,0.141176,0.015455,22,98.0
|
| 878 |
+
5,protocol of this,0.197647,0.731818,0.130588,0.017273,23,99.0
|
| 879 |
+
5,This,0.308627,0.777576,0.037255,0.018485,24,98.0
|
| 880 |
+
5,Rudolph,0.256078,0.874848,0.08,0.018485,25,98.0
|
| 881 |
+
5,New,0.284314,0.906061,0.042353,0.017576,26,99.0
|
| 882 |
+
5,IONAL,0.323137,0.066667,0.08549,0.009091,27,93.0
|
| 883 |
+
5,locally.,0.358824,0.084848,0.051765,0.016364,28,99.0
|
| 884 |
+
5,NEW YORK,0.312157,0.353939,0.114118,0.014545,29,98.0
|
| 885 |
+
5,partnership,0.322745,0.402121,0.092549,0.014242,30,99.0
|
| 886 |
+
5,to common,0.361176,0.419697,0.088627,0.008788,31,99.0
|
| 887 |
+
5,Sister City,0.35098,0.433636,0.082745,0.011515,32,99.0
|
| 888 |
+
5,network of,0.347451,0.449394,0.083922,0.010303,33,99.0
|
| 889 |
+
5,and publicize,0.322745,0.481515,0.103922,0.012727,34,99.0
|
| 890 |
+
5,flourish to,0.338431,0.496667,0.079608,0.010606,35,99.0
|
| 891 |
+
5,promote the,0.33098,0.51303,0.097647,0.013939,36,99.0
|
| 892 |
+
5,programs learning from to,0.326275,0.530606,0.108627,0.027576,37,99.0
|
| 893 |
+
5,improvement,0.330196,0.559697,0.105882,0.013939,38,99.0
|
| 894 |
+
5,a conduit of,0.338431,0.576364,0.098824,0.010606,39,99.0
|
| 895 |
+
5,common,0.361176,0.590303,0.07098,0.014545,40,98.0
|
| 896 |
+
5,increasingly,0.317647,0.606667,0.103137,0.014545,41,99.0
|
| 897 |
+
5,mayoral,0.334902,0.63697,0.069804,0.017879,42,97.0
|
| 898 |
+
5,or in kind,0.368235,0.655152,0.078431,0.012727,43,99.0
|
| 899 |
+
5,Sister City,0.327451,0.670303,0.087843,0.015758,44,99.0
|
| 900 |
+
5,purposes in,0.321176,0.700303,0.102745,0.016364,45,98.0
|
| 901 |
+
5,confirm that,0.339608,0.716667,0.101176,0.015455,46,98.0
|
| 902 |
+
5,Memorandum,0.328235,0.731818,0.110588,0.017273,47,99.0
|
| 903 |
+
5,agreement,0.345882,0.777576,0.088627,0.018485,48,98.0
|
| 904 |
+
5,huliaui,0.315294,0.807576,0.160392,0.073939,49,88.0
|
| 905 |
+
5,W. Giuliani,0.336078,0.874848,0.089412,0.018485,50,98.0
|
| 906 |
+
5,Mayor,0.30902,0.892121,0.06,0.015758,51,99.0
|
| 907 |
+
5,York City,0.326667,0.906061,0.085882,0.017576,52,99.0
|
| 908 |
+
5,The City,0.437255,0.259697,0.098824,0.013333,53,85.0
|
| 909 |
+
5,OFFiCe,0.450588,0.276061,0.076471,0.011515,54,73.0
|
| 910 |
+
5,"NEW YORK,",0.46,0.291212,0.105882,0.012121,55,89.0
|
| 911 |
+
5,CITY-LONDON,0.426275,0.353939,0.147059,0.014545,56,98.0
|
| 912 |
+
5,Memorandum,0.420784,0.369394,0.121961,0.014545,57,99.0
|
| 913 |
+
5,between New,0.420784,0.401515,0.109412,0.01303,58,97.0
|
| 914 |
+
5,challenges,0.456078,0.417879,0.082745,0.011515,59,99.0
|
| 915 |
+
5,relationship,0.439608,0.43303,0.09098,0.012121,60,99.0
|
| 916 |
+
5,global,0.437255,0.449091,0.049412,0.011818,61,99.0
|
| 917 |
+
5,existing,0.431373,0.48,0.067059,0.013939,62,99.0
|
| 918 |
+
5,benefit a wider,0.421569,0.496667,0.120784,0.010303,63,99.0
|
| 919 |
+
5,development,0.430588,0.512121,0.10549,0.013939,64,99.0
|
| 920 |
+
5,encourage both,0.423922,0.528485,0.122353,0.013636,65,99.0
|
| 921 |
+
5,one another;,0.44,0.542727,0.100392,0.012727,66,99.0
|
| 922 |
+
5,of the operation,0.439608,0.560606,0.126667,0.011515,67,99.0
|
| 923 |
+
5,information:,0.437255,0.575758,0.099216,0.010303,68,99.0
|
| 924 |
+
5,"to both, that",0.432157,0.590303,0.101176,0.014545,69,98.0
|
| 925 |
+
5,"powerful financial,",0.420784,0.606667,0.154902,0.014545,70,99.0
|
| 926 |
+
5,priorities relevant,0.404706,0.63697,0.16,0.017879,71,97.0
|
| 927 |
+
5,s support to,0.446667,0.655152,0.089804,0.014242,72,99.0
|
| 928 |
+
5,partnership;,0.40902,0.671515,0.095686,0.015152,73,99.0
|
| 929 |
+
5,"mind, the Mayor",0.423922,0.700303,0.132157,0.016364,74,98.0
|
| 930 |
+
5,these two,0.440784,0.716667,0.08,0.015455,75,98.0
|
| 931 |
+
5,of Understanding.,0.438824,0.731818,0.144314,0.017273,76,99.0
|
| 932 |
+
5,will go into,0.43451,0.777576,0.094118,0.018485,77,98.0
|
| 933 |
+
5,Signed in,0.456078,0.794848,0.078824,0.014545,78,97.0
|
| 934 |
+
5,OF New,0.536078,0.259697,0.089804,0.013333,79,85.0
|
| 935 |
+
5,OF THE MayOr,0.527059,0.276061,0.144706,0.011515,80,73.0
|
| 936 |
+
5,N.Y. 1OOO7,0.565882,0.291212,0.091765,0.012121,81,89.0
|
| 937 |
+
5,SISTER,0.573333,0.353939,0.076078,0.014545,82,98.0
|
| 938 |
+
5,of Understanding,0.542745,0.369394,0.152941,0.014545,83,99.0
|
| 939 |
+
5,York City and,0.534902,0.400606,0.114118,0.013333,84,99.0
|
| 940 |
+
5,for these two,0.545882,0.417273,0.106275,0.011818,85,99.0
|
| 941 |
+
5,between the,0.537647,0.43303,0.102353,0.010303,86,96.0
|
| 942 |
+
5,"partnerships, as it",0.491765,0.448182,0.138039,0.012121,87,98.0
|
| 943 |
+
5,exchanges between,0.496863,0.478788,0.166667,0.014848,88,99.0
|
| 944 |
+
5,cross-section,0.546275,0.497273,0.105098,0.009091,89,99.0
|
| 945 |
+
5,of new social.,0.536863,0.512727,0.11098,0.011515,90,97.0
|
| 946 |
+
5,cities citizens,0.552157,0.528788,0.112157,0.009091,91,99.0
|
| 947 |
+
5,of the cities',0.56549,0.559697,0.103529,0.012727,92,97.0
|
| 948 |
+
5,can generate,0.533333,0.590303,0.105882,0.014545,93,98.0
|
| 949 |
+
5,social and,0.575686,0.606667,0.085098,0.014545,94,99.0
|
| 950 |
+
5,to both,0.564706,0.63697,0.054118,0.017879,95,97.0
|
| 951 |
+
5,community-led,0.536471,0.655758,0.119608,0.013636,96,99.0
|
| 952 |
+
5,of the City,0.556078,0.700303,0.09098,0.016364,97,98.0
|
| 953 |
+
5,cities are u united,0.520784,0.716667,0.129804,0.015455,98,98.0
|
| 954 |
+
5,effect from the,0.528627,0.777576,0.121569,0.018485,99,98.0
|
| 955 |
+
5,March of 2001,0.534902,0.794848,0.116471,0.014545,100,97.0
|
| 956 |
+
5,York,0.625882,0.259697,0.056078,0.013333,101,85.0
|
| 957 |
+
5,CITY,0.649412,0.353939,0.058039,0.014545,102,98.0
|
| 958 |
+
5,London will,0.64902,0.400606,0.100784,0.013333,103,99.0
|
| 959 |
+
5,great,0.652157,0.417273,0.04902,0.011818,104,99.0
|
| 960 |
+
5,two will be one,0.64,0.43303,0.120392,0.010303,105,96.0
|
| 961 |
+
5,strives to:,0.629804,0.448182,0.079608,0.012121,106,98.0
|
| 962 |
+
5,London,0.663529,0.478788,0.088235,0.014848,107,99.0
|
| 963 |
+
5,of the citizens,0.654118,0.495152,0.113725,0.012121,108,99.0
|
| 964 |
+
5,"economic,",0.652941,0.511818,0.085098,0.012727,109,99.0
|
| 965 |
+
5,to share,0.66902,0.527273,0.069412,0.013636,110,99.0
|
| 966 |
+
5,various,0.66902,0.559697,0.066667,0.012727,111,97.0
|
| 967 |
+
5,new initiatives,0.639216,0.590303,0.118431,0.014545,112,98.0
|
| 968 |
+
5,cultural,0.660784,0.606667,0.066275,0.014545,113,99.0
|
| 969 |
+
5,London and,0.618824,0.63697,0.12902,0.017879,114,97.0
|
| 970 |
+
5,programs,0.656078,0.655758,0.083922,0.013636,115,99.0
|
| 971 |
+
5,of New York,0.647059,0.700303,0.107843,0.016364,116,98.0
|
| 972 |
+
5,by an official,0.650588,0.717273,0.111373,0.013636,117,99.0
|
| 973 |
+
5,date of,0.650196,0.777576,0.062353,0.018485,118,98.0
|
| 974 |
+
5,Ken,0.668627,0.873636,0.037647,0.018182,119,98.0
|
| 975 |
+
5,Mayor,0.703922,0.890606,0.06,0.01697,120,99.0
|
| 976 |
+
5,London,0.698039,0.906364,0.069412,0.016061,121,99.0
|
| 977 |
+
5,PARTNERSHIP,0.707451,0.353939,0.143137,0.014545,122,98.0
|
| 978 |
+
5,foster mutually,0.749804,0.400606,0.125098,0.013333,123,99.0
|
| 979 |
+
5,cosmopolitan entities,0.701176,0.417273,0.166275,0.011818,124,99.0
|
| 980 |
+
5,of the most,0.760392,0.43303,0.093333,0.010303,125,96.0
|
| 981 |
+
5,and New York,0.751765,0.478788,0.095686,0.014848,126,99.0
|
| 982 |
+
5,of both;,0.767843,0.495152,0.06549,0.012121,127,98.0
|
| 983 |
+
5,academic and,0.738039,0.511818,0.112157,0.012727,128,99.0
|
| 984 |
+
5,their experiences,0.738431,0.527273,0.139608,0.013636,129,99.0
|
| 985 |
+
5,government,0.735686,0.559697,0.1,0.012727,130,97.0
|
| 986 |
+
5,to further,0.757647,0.590303,0.08549,0.014545,131,98.0
|
| 987 |
+
5,relationships,0.727059,0.606667,0.10902,0.014545,132,99.0
|
| 988 |
+
5,New York City;,0.747843,0.63697,0.090196,0.017879,133,97.0
|
| 989 |
+
5,that advance,0.74,0.655758,0.106667,0.013636,134,99.0
|
| 990 |
+
5,and the,0.754902,0.700303,0.065098,0.016364,135,98.0
|
| 991 |
+
5,partnership,0.761961,0.717273,0.1,0.013636,136,99.0
|
| 992 |
+
5,signatures.,0.712549,0.777576,0.086275,0.018485,137,98.0
|
| 993 |
+
5,G,0.701569,0.82697,0.100784,0.038485,138,55.0
|
| 994 |
+
5,Livingstone,0.706275,0.873636,0.1,0.018182,139,98.0
|
| 995 |
+
5,Toolkit,0.82902,0.069091,0.133333,0.032424,140,99.0
|
| 996 |
+
5,City so,0.847451,0.478788,0.055294,0.014848,141,99.0
|
| 997 |
+
5,as a,0.878039,0.527273,0.031765,0.013636,142,99.0
|
| 998 |
+
5,agencies,0.835686,0.559697,0.07098,0.012727,143,97.0
|
| 999 |
+
5,and,0.843137,0.590303,0.028627,0.014545,144,98.0
|
| 1000 |
+
5,between,0.836078,0.606667,0.06549,0.014545,145,99.0
|
| 1001 |
+
5,the,0.846667,0.655758,0.024706,0.013636,146,99.0
|
| 1002 |
+
5,Mayor of,0.82,0.700303,0.076863,0.016364,147,98.0
|
| 1003 |
+
5,by the,0.861961,0.717273,0.048235,0.013636,148,99.0
|
| 1004 |
+
6,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,99.0
|
| 1005 |
+
6,?,0.072157,0.689091,0.383529,0.282121,2,33.0
|
| 1006 |
+
6,IN TE RN A T,0.17098,0.066667,0.146667,0.009091,3,88.0
|
| 1007 |
+
6,Connect globally.,0.167451,0.084848,0.13451,0.016364,4,99.0
|
| 1008 |
+
6,"California, In accordance USA,",0.216471,0.482121,0.138824,0.021515,5,98.0
|
| 1009 |
+
6,"purposes: that a ""Sister City",0.216078,0.502727,0.121176,0.024545,6,97.0
|
| 1010 |
+
6,the people (1) of Long to,0.218431,0.531515,0.114118,0.024242,7,97.0
|
| 1011 |
+
6,relations between (2) to,0.217255,0.561212,0.107843,0.025152,8,98.0
|
| 1012 |
+
6,"information professional, in",0.217647,0.583636,0.10549,0.023333,9,96.0
|
| 1013 |
+
6,efucational (3) activities to,0.218431,0.612424,0.129412,0.023939,10,98.0
|
| 1014 |
+
6,this document This Sister,0.217647,0.642424,0.11451,0.025152,11,97.0
|
| 1015 |
+
6,Mayor of San,0.218039,0.663333,0.089412,0.015758,12,96.0
|
| 1016 |
+
6,IO N A L,0.317647,0.066667,0.09098,0.009091,13,88.0
|
| 1017 |
+
6,Thrive locally.,0.301961,0.084848,0.108627,0.016364,14,99.0
|
| 1018 |
+
6,Pister,0.321176,0.308182,0.096863,0.022424,15,96.0
|
| 1019 |
+
6,City,0.378431,0.374545,0.058431,0.020303,16,99.0
|
| 1020 |
+
6,City of,0.321569,0.425758,0.102353,0.019091,17,98.0
|
| 1021 |
+
6,"Ecuador,",0.345882,0.447273,0.126667,0.014848,18,98.0
|
| 1022 |
+
6,and the with City the,0.322745,0.482121,0.100784,0.021515,19,98.0
|
| 1023 |
+
6,"Agreement""",0.337255,0.502727,0.081961,0.012121,20,97.0
|
| 1024 |
+
6,promote Beach and and the,0.320784,0.531515,0.098824,0.024242,21,97.0
|
| 1025 |
+
6,promote the fwo cities,0.324314,0.561212,0.083922,0.025152,22,97.0
|
| 1026 |
+
6,"technical, a wide youth, sariety and",0.300784,0.583636,0.124314,0.023333,23,96.0
|
| 1027 |
+
6,foster between and the,0.321176,0.612424,0.102745,0.023939,24,98.0
|
| 1028 |
+
6,City has been Agreement duly,0.331373,0.642424,0.1,0.025152,25,97.0
|
| 1029 |
+
6,"Pablo de Manta,",0.307451,0.663333,0.099608,0.015758,26,96.0
|
| 1030 |
+
6,Cit,0.39098,0.201515,0.098039,0.055455,27,89.0
|
| 1031 |
+
6,Lity,0.432157,0.30697,0.072549,0.033333,28,96.0
|
| 1032 |
+
6,between,0.464706,0.35,0.062745,0.013636,29,98.0
|
| 1033 |
+
6,of Long,0.436863,0.374545,0.114902,0.020303,30,99.0
|
| 1034 |
+
6,"California,",0.400392,0.396667,0.155686,0.016061,31,99.0
|
| 1035 |
+
6,San and,0.423922,0.412727,0.087451,0.032121,32,98.0
|
| 1036 |
+
6,South,0.472549,0.447273,0.083529,0.014848,33,98.0
|
| 1037 |
+
6,authorization of San Pablo and de,0.403922,0.482121,0.110196,0.021515,34,98.0
|
| 1038 |
+
6,between the two,0.419216,0.502727,0.102745,0.012121,35,97.0
|
| 1039 |
+
6,expand people the of effective San,0.402353,0.531515,0.127059,0.024242,36,97.0
|
| 1040 |
+
6,"international and their respective goodwill,",0.38549,0.561212,0.152549,0.025152,37,97.0
|
| 1041 |
+
6,"of other economic, endeavors;and",0.423922,0.583636,0.127451,0.023333,38,96.0
|
| 1042 |
+
6,"encourage two cities; charitable,",0.391373,0.612424,0.138824,0.023939,39,98.0
|
| 1043 |
+
6,executed shall be by officially the,0.401176,0.642424,0.135294,0.025152,40,97.0
|
| 1044 |
+
6,"Ecuador, South",0.407059,0.663333,0.098431,0.015758,41,96.0
|
| 1045 |
+
6,of Long,0.50902,0.210303,0.105098,0.057576,42,89.0
|
| 1046 |
+
6,Agreement,0.514902,0.308182,0.188627,0.032121,43,96.0
|
| 1047 |
+
6,the,0.527451,0.35,0.022353,0.013636,44,98.0
|
| 1048 |
+
6,Beach,0.551765,0.374545,0.082745,0.020303,45,99.0
|
| 1049 |
+
6,USA,0.556078,0.396667,0.056471,0.016061,46,99.0
|
| 1050 |
+
6,Pablo the de,0.484314,0.412727,0.126275,0.032121,47,98.0
|
| 1051 |
+
6,America,0.556078,0.447273,0.10902,0.014848,48,98.0
|
| 1052 |
+
6,"Manta, approval Ecuador, expressed",0.507451,0.482121,0.128235,0.021515,49,98.0
|
| 1053 |
+
6,cities is hereby,0.521961,0.502727,0.098431,0.012121,50,97.0
|
| 1054 |
+
6,Pablo and de mutually Manta; and,0.504706,0.531515,0.12,0.024242,51,97.0
|
| 1055 |
+
6,"natioms understanding, by the",0.532549,0.561212,0.108235,0.025152,52,97.0
|
| 1056 |
+
6,"social, cultural,",0.518039,0.583636,0.109412,0.012121,53,96.0
|
| 1057 |
+
6,"scientific, trade",0.530196,0.612424,0.121961,0.013939,54,98.0
|
| 1058 |
+
6,Mayor established of Long and,0.507059,0.642424,0.125098,0.025152,55,97.0
|
| 1059 |
+
6,America.,0.50549,0.663333,0.053333,0.015758,56,96.0
|
| 1060 |
+
6,Beperly,0.587059,0.735758,0.063137,0.015758,57,99.0
|
| 1061 |
+
6,"Mayor, City",0.542745,0.750909,0.096078,0.01303,58,98.0
|
| 1062 |
+
6,"Mayor, Ing. Jorge",0.504314,0.824545,0.098431,0.028485,59,97.0
|
| 1063 |
+
6,"Ecuador, City of",0.552157,0.840303,0.070588,0.025152,60,99.0
|
| 1064 |
+
6,Dated:,0.545882,0.883333,0.051765,0.011515,61,99.0
|
| 1065 |
+
6,alifornia,0.561569,0.257879,0.127451,0.028485,62,99.0
|
| 1066 |
+
6,Manta,0.610588,0.425758,0.081961,0.019091,63,98.0
|
| 1067 |
+
6,"South bry America, the City",0.615686,0.482121,0.103137,0.021515,64,97.0
|
| 1068 |
+
6,established for,0.620392,0.502727,0.1,0.012121,65,97.0
|
| 1069 |
+
6,beneficial cooperation,0.612549,0.531515,0.137647,0.012727,66,98.0
|
| 1070 |
+
6,exchange and expanded of,0.620784,0.561212,0.119216,0.025152,67,98.0
|
| 1071 |
+
6,"municipal,",0.627451,0.583636,0.08,0.012121,68,96.0
|
| 1072 |
+
6,"and commerce,",0.652157,0.612424,0.075686,0.013939,69,98.0
|
| 1073 |
+
6,"Beach, shall California, become",0.606667,0.642424,0.138039,0.025152,70,97.0
|
| 1074 |
+
6,Hiee,0.553333,0.697879,0.179216,0.037273,71,54.0
|
| 1075 |
+
6,bNeill,0.647059,0.735758,0.064706,0.011818,72,90.0
|
| 1076 |
+
6,of Long USA,0.643137,0.750909,0.066275,0.025455,73,98.0
|
| 1077 |
+
6,"California,",0.583137,0.766364,0.088627,0.011515,74,99.0
|
| 1078 |
+
6,Dulus,0.596471,0.792424,0.080784,0.027273,75,69.0
|
| 1079 |
+
6,O.Zambrano Pablo,0.603922,0.824545,0.107843,0.025152,76,98.0
|
| 1080 |
+
6,San South America,0.623922,0.839394,0.116078,0.025455,77,99.0
|
| 1081 |
+
6,September,0.600392,0.881818,0.107451,0.013939,78,97.0
|
| 1082 |
+
6,Beach,0.715686,0.210303,0.148235,0.048182,79,89.0
|
| 1083 |
+
6,"of it Long is declared Beach,",0.708627,0.482121,0.091373,0.021515,80,98.0
|
| 1084 |
+
6,the following,0.720392,0.502727,0.079608,0.012121,81,97.0
|
| 1085 |
+
6,between,0.750196,0.531515,0.050196,0.012727,82,98.0
|
| 1086 |
+
6,"people, business ideas, and",0.698824,0.561212,0.102353,0.025152,83,97.0
|
| 1087 |
+
6,"environmental,",0.707451,0.583636,0.092941,0.012121,84,96.0
|
| 1088 |
+
6,literary and,0.727843,0.612424,0.073333,0.013939,85,98.0
|
| 1089 |
+
6,"effective USA, and when the",0.712157,0.642424,0.08902,0.025152,86,97.0
|
| 1090 |
+
6,Beach,0.704314,0.750909,0.049804,0.012727,87,97.0
|
| 1091 |
+
6,Cedeño,0.707451,0.824242,0.063137,0.011212,88,96.0
|
| 1092 |
+
6,de Manta,0.706667,0.839091,0.076471,0.009394,89,99.0
|
| 1093 |
+
6,"19,2000",0.707843,0.881818,0.04,0.013939,90,97.0
|
| 1094 |
+
6,Toolkit,0.82902,0.069091,0.133333,0.032424,91,99.0
|
| 1095 |
+
7,SisterCities Partnership Agreement,0.167843,0.025758,0.793725,0.03697,1,99.0
|
| 1096 |
+
7,IN TE RN A,0.17098,0.066667,0.125882,0.009091,2,90.0
|
| 1097 |
+
7,Connect globally.,0.167843,0.085152,0.134118,0.015455,3,98.0
|
| 1098 |
+
7,adopted by,0.198431,0.211212,0.085882,0.017576,4,99.0
|
| 1099 |
+
7,and,0.195686,0.256364,0.03451,0.019091,5,99.0
|
| 1100 |
+
7,On this,0.252549,0.358182,0.06,0.021515,6,98.0
|
| 1101 |
+
7,the traditional,0.196471,0.375758,0.112549,0.019394,7,98.0
|
| 1102 |
+
7,aspiration to,0.198824,0.393939,0.105098,0.014848,8,98.0
|
| 1103 |
+
7,Richard M.,0.198431,0.409697,0.089804,0.015455,9,97.0
|
| 1104 |
+
7,of the City of,0.198824,0.426364,0.105098,0.014545,10,97.0
|
| 1105 |
+
7,sister cities,0.198431,0.441818,0.111765,0.015152,11,98.0
|
| 1106 |
+
7,The,0.25451,0.471212,0.034902,0.017576,12,98.0
|
| 1107 |
+
7,equality and,0.198824,0.487273,0.098431,0.017576,13,98.0
|
| 1108 |
+
7,broaden economic,0.199608,0.504848,0.139608,0.014848,14,98.0
|
| 1109 |
+
7,The two,0.253725,0.534545,0.062353,0.019697,15,98.0
|
| 1110 |
+
7,and trade,0.198824,0.553333,0.077647,0.014848,16,98.0
|
| 1111 |
+
7,In,0.253333,0.582121,0.023922,0.017576,17,99.0
|
| 1112 |
+
7,dance and other,0.199216,0.600606,0.125098,0.011818,18,96.0
|
| 1113 |
+
7,In,0.253333,0.628182,0.024314,0.017576,19,98.0
|
| 1114 |
+
7,within educational,0.199216,0.647273,0.137647,0.01303,20,98.0
|
| 1115 |
+
7,In,0.252549,0.675152,0.023529,0.018182,21,99.0
|
| 1116 |
+
7,"technology,",0.198039,0.69303,0.092941,0.015152,22,97.0
|
| 1117 |
+
7,further,0.196863,0.711818,0.061569,0.012424,23,99.0
|
| 1118 |
+
7,THE HONORABLE,0.182353,0.818485,0.154902,0.016061,24,97.0
|
| 1119 |
+
7,MAYOR OF,0.196078,0.834242,0.088235,0.013939,25,97.0
|
| 1120 |
+
7,T IO N A L,0.296863,0.066667,0.111765,0.009091,26,90.0
|
| 1121 |
+
7,Thrive locally.,0.301961,0.085152,0.108235,0.015455,27,98.0
|
| 1122 |
+
7,the tenth,0.312549,0.358182,0.069412,0.021515,28,98.0
|
| 1123 |
+
7,links of,0.30902,0.375758,0.063922,0.019394,29,98.0
|
| 1124 |
+
7,wark in unison,0.303922,0.393939,0.119608,0.014848,30,98.0
|
| 1125 |
+
7,"Daley, Mayor",0.288235,0.409697,0.110588,0.015455,31,97.0
|
| 1126 |
+
7,"Shenyang, on",0.303922,0.426364,0.102745,0.014545,32,97.0
|
| 1127 |
+
7,agreement,0.310196,0.441818,0.06549,0.015152,33,98.0
|
| 1128 |
+
7,City of Chicago,0.289412,0.471212,0.127843,0.017576,34,98.0
|
| 1129 |
+
7,mutual benefit,0.297255,0.487273,0.116863,0.017576,35,98.0
|
| 1130 |
+
7,cooperation,0.339216,0.504848,0.093725,0.014848,36,98.0
|
| 1131 |
+
7,cities do,0.316078,0.534545,0.066667,0.019697,37,98.0
|
| 1132 |
+
7,relations between,0.276471,0.553333,0.166275,0.014848,38,98.0
|
| 1133 |
+
7,"addition, exchanges",0.277255,0.582121,0.156078,0.017576,39,99.0
|
| 1134 |
+
7,cultural,0.324314,0.600606,0.061569,0.011818,40,96.0
|
| 1135 |
+
7,"addition, exchanges",0.277647,0.628182,0.156078,0.017576,41,98.0
|
| 1136 |
+
7,institutions,0.336863,0.647273,0.087059,0.01303,42,98.0
|
| 1137 |
+
7,"addition, we declare",0.276078,0.675152,0.153725,0.018182,43,99.0
|
| 1138 |
+
7,"sports, health,",0.29098,0.69303,0.116471,0.015152,44,97.0
|
| 1139 |
+
7,development of,0.258431,0.711818,0.115686,0.012424,45,99.0
|
| 1140 |
+
7,RICHARD,0.337255,0.818485,0.082745,0.016061,46,97.0
|
| 1141 |
+
7,CHICAGO,0.284314,0.834242,0.076078,0.013939,47,97.0
|
| 1142 |
+
7,REAFFIRMATION OF,0.323922,0.162727,0.199216,0.018485,48,97.0
|
| 1143 |
+
7,THE HONORABLE,0.394902,0.212727,0.160392,0.014545,49,99.0
|
| 1144 |
+
7,MAYOR,0.472157,0.23,0.070588,0.013333,50,98.0
|
| 1145 |
+
7,THE HONORABLE,0.401569,0.258788,0.163922,0.014242,51,99.0
|
| 1146 |
+
7,MAYOR,0.463529,0.272424,0.07098,0.01303,52,99.0
|
| 1147 |
+
7,anniversary of,0.381961,0.358182,0.118431,0.021515,53,98.0
|
| 1148 |
+
7,friendship between,0.372941,0.375758,0.156078,0.019394,54,98.0
|
| 1149 |
+
7,for the,0.423529,0.393939,0.063529,0.014848,55,98.0
|
| 1150 |
+
7,of the City of,0.398824,0.409697,0.11451,0.015455,56,97.0
|
| 1151 |
+
7,this fifth day,0.406667,0.426364,0.103137,0.014545,57,97.0
|
| 1152 |
+
7,between the City,0.375686,0.441818,0.138824,0.015152,58,98.0
|
| 1153 |
+
7,and the City,0.417255,0.471212,0.107059,0.017576,59,98.0
|
| 1154 |
+
7,will continue,0.414118,0.487273,0.105098,0.017576,60,98.0
|
| 1155 |
+
7,and cultural,0.432941,0.504848,0.094118,0.014848,61,98.0
|
| 1156 |
+
7,hereby declare,0.382745,0.534545,0.121569,0.019697,62,98.0
|
| 1157 |
+
7,Chicago and,0.442745,0.553333,0.066667,0.014848,63,98.0
|
| 1158 |
+
7,will be,0.433333,0.582121,0.052549,0.017576,64,99.0
|
| 1159 |
+
7,activities.,0.385882,0.600606,0.068235,0.011818,65,96.0
|
| 1160 |
+
7,will be,0.433725,0.628182,0.052549,0.017576,66,98.0
|
| 1161 |
+
7,encouraged.,0.423922,0.647273,0.09451,0.01303,67,98.0
|
| 1162 |
+
7,our intention,0.429804,0.675152,0.102745,0.018182,68,99.0
|
| 1163 |
+
7,youth and any,0.407451,0.69303,0.116471,0.015152,69,97.0
|
| 1164 |
+
7,friendship between,0.374118,0.711818,0.144314,0.012424,70,99.0
|
| 1165 |
+
7,M. DALEY,0.42,0.818485,0.080392,0.016061,71,97.0
|
| 1166 |
+
7,SISTER,0.523137,0.162727,0.07451,0.018485,72,97.0
|
| 1167 |
+
7,RICHARD,0.555294,0.212727,0.092549,0.014545,73,99.0
|
| 1168 |
+
7,OF CHICAGO,0.542745,0.23,0.115294,0.013333,74,98.0
|
| 1169 |
+
7,ZHANG,0.56549,0.258788,0.069412,0.014242,75,99.0
|
| 1170 |
+
7,OF SHENYANG,0.53451,0.272424,0.131765,0.01303,76,99.0
|
| 1171 |
+
7,ON,0.549412,0.295455,0.032157,0.016667,77,99.0
|
| 1172 |
+
7,"JUNE 5, 1995",0.500784,0.321818,0.128235,0.015758,78,96.0
|
| 1173 |
+
7,the signing of a,0.500392,0.358182,0.120784,0.021515,79,98.0
|
| 1174 |
+
7,Chicago and,0.52902,0.375758,0.098824,0.019394,80,98.0
|
| 1175 |
+
7,benefit of their,0.487059,0.393939,0.12902,0.014848,81,98.0
|
| 1176 |
+
7,"Chicago, and",0.513333,0.409697,0.106667,0.015455,82,97.0
|
| 1177 |
+
7,"of June I995,",0.509804,0.426364,0.11098,0.014545,83,97.0
|
| 1178 |
+
7,of Chicago and,0.51451,0.441818,0.103529,0.015152,84,98.0
|
| 1179 |
+
7,of Shenyang,0.524314,0.471212,0.107059,0.017576,85,98.0
|
| 1180 |
+
7,to develop a,0.519216,0.487273,0.092941,0.017576,86,98.0
|
| 1181 |
+
7,erchanges,0.527059,0.504848,0.083137,0.014848,87,98.0
|
| 1182 |
+
7,their interest in,0.504314,0.534545,0.121961,0.019697,88,98.0
|
| 1183 |
+
7,Shenyang.,0.509412,0.553333,0.076078,0.014848,89,98.0
|
| 1184 |
+
7,promoted in the,0.485882,0.582121,0.127059,0.017576,90,99.0
|
| 1185 |
+
7,promoted in,0.486275,0.628182,0.098824,0.017576,91,98.0
|
| 1186 |
+
7,to promote,0.532549,0.675152,0.086275,0.018182,92,99.0
|
| 1187 |
+
7,areas that,0.523922,0.69303,0.086275,0.015152,93,97.0
|
| 1188 |
+
7,the people of,0.518431,0.711818,0.1,0.012424,94,99.0
|
| 1189 |
+
7,THE,0.587843,0.818485,0.039216,0.013939,95,99.0
|
| 1190 |
+
7,MAYOR,0.588235,0.834545,0.061961,0.012424,96,97.0
|
| 1191 |
+
7,CITIES,0.597647,0.162727,0.119216,0.018485,97,97.0
|
| 1192 |
+
7,M. DALEY,0.647843,0.212727,0.085882,0.014545,98,99.0
|
| 1193 |
+
7,RONGMAO,0.634902,0.258788,0.09098,0.014242,99,99.0
|
| 1194 |
+
7,sister city,0.621176,0.358182,0.078431,0.021515,100,98.0
|
| 1195 |
+
7,Shenyang,0.627843,0.375758,0.080784,0.019394,101,98.0
|
| 1196 |
+
7,cities and,0.616078,0.393939,0.080392,0.014848,102,98.0
|
| 1197 |
+
7,the Honorable,0.62,0.409697,0.119608,0.015455,103,97.0
|
| 1198 |
+
7,do hereby,0.620784,0.426364,0.077647,0.014545,104,97.0
|
| 1199 |
+
7,the City of,0.618039,0.441818,0.082353,0.015152,105,98.0
|
| 1200 |
+
7,on the basis,0.631373,0.471212,0.103137,0.017576,106,98.0
|
| 1201 |
+
7,sister cities,0.612157,0.487273,0.098824,0.017576,107,98.0
|
| 1202 |
+
7,between the two,0.610196,0.504848,0.122353,0.014848,108,98.0
|
| 1203 |
+
7,exploring the,0.626275,0.534545,0.103137,0.019697,109,98.0
|
| 1204 |
+
7,area of the,0.612941,0.582121,0.090588,0.017576,110,99.0
|
| 1205 |
+
7,education and the,0.585098,0.628182,0.142353,0.017576,111,98.0
|
| 1206 |
+
7,exchanges in,0.618824,0.675152,0.102745,0.018182,112,99.0
|
| 1207 |
+
7,will contribute,0.610196,0.69303,0.114118,0.015152,113,97.0
|
| 1208 |
+
7,our two cities.,0.618431,0.711818,0.1,0.012424,114,99.0
|
| 1209 |
+
7,HONORABLE,0.627059,0.818485,0.107059,0.013939,115,99.0
|
| 1210 |
+
7,OF,0.650196,0.834545,0.027059,0.012424,116,97.0
|
| 1211 |
+
7,DECLARATION,0.716863,0.162727,0.092941,0.018485,117,97.0
|
| 1212 |
+
7,"agreement, in",0.699608,0.358182,0.104706,0.021515,118,98.0
|
| 1213 |
+
7,and to,0.708627,0.375758,0.052549,0.019394,119,98.0
|
| 1214 |
+
7,"nations, the",0.696471,0.393939,0.094902,0.014848,120,98.0
|
| 1215 |
+
7,Zhang,0.739608,0.409697,0.055294,0.015455,121,97.0
|
| 1216 |
+
7,acknowledge,0.698431,0.426364,0.1,0.014545,122,97.0
|
| 1217 |
+
7,Shenyang,0.700392,0.441818,0.072549,0.015152,123,98.0
|
| 1218 |
+
7,of friendly,0.73451,0.471212,0.08902,0.017576,124,98.0
|
| 1219 |
+
7,relationship,0.71098,0.487273,0.097647,0.017576,125,98.0
|
| 1220 |
+
7,cities.,0.732549,0.504848,0.041569,0.014848,126,98.0
|
| 1221 |
+
7,establishment,0.729412,0.534545,0.106667,0.019697,127,98.0
|
| 1222 |
+
7,arts such as,0.703529,0.582121,0.103922,0.017576,128,99.0
|
| 1223 |
+
7,establishment,0.727451,0.628182,0.107843,0.017576,129,98.0
|
| 1224 |
+
7,such fields,0.721569,0.675152,0.089412,0.018182,130,99.0
|
| 1225 |
+
7,to the,0.724314,0.69303,0.05098,0.015152,131,97.0
|
| 1226 |
+
7,ZHANG,0.734118,0.818485,0.061961,0.013939,132,99.0
|
| 1227 |
+
7,SHENYANG,0.677255,0.834545,0.088235,0.012424,133,97.0
|
| 1228 |
+
7,Toolkit,0.82902,0.069394,0.132941,0.031515,134,99.0
|
| 1229 |
+
7,order to further,0.804314,0.358182,0.127059,0.021515,135,98.0
|
| 1230 |
+
7,reaffirm their mutual,0.761176,0.375758,0.16902,0.019394,136,98.0
|
| 1231 |
+
7,Honorable Mayor,0.791373,0.393939,0.136863,0.014848,137,98.0
|
| 1232 |
+
7,"Rongmao, Mayor",0.794902,0.409697,0.133333,0.015455,138,97.0
|
| 1233 |
+
7,and reaffir.n the,0.798431,0.426364,0.128235,0.014545,139,97.0
|
| 1234 |
+
7,"cooperation,",0.823529,0.471212,0.1,0.017576,140,98.0
|
| 1235 |
+
7,to promote and,0.808627,0.487273,0.118431,0.017576,141,98.0
|
| 1236 |
+
7,of business,0.836078,0.534545,0.089412,0.019697,142,98.0
|
| 1237 |
+
7,"exhibits, music,",0.807451,0.582121,0.113725,0.017576,143,99.0
|
| 1238 |
+
7,of contacts,0.835294,0.628182,0.088235,0.017576,144,98.0
|
| 1239 |
+
7,as science and,0.81098,0.675152,0.113333,0.018182,145,99.0
|
| 1240 |
+
7,prosperity and the,0.775294,0.69303,0.147059,0.015152,146,97.0
|
| 1241 |
+
7,RONGMAO,0.796078,0.818485,0.081569,0.013939,147,99.0
|