seanpedrickcase commited on
Commit
3753e61
·
0 Parent(s):

Sync: Merge pull request #162 from seanpedrick-case/dev

Browse files

Updated with new blog post. Adjusted relevant VLM and LLM deployment parameters so that results can be replicated

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .coveragerc +56 -0
  2. .dockerignore +40 -0
  3. .gitattributes +8 -0
  4. .github/scripts/setup_test_data.py +311 -0
  5. .github/workflow_README.md +183 -0
  6. .github/workflows/archive_workflows/multi-os-test.yml +109 -0
  7. .github/workflows/ci.yml +260 -0
  8. .github/workflows/simple-test.yml +67 -0
  9. .github/workflows/sync_to_hf.yml +53 -0
  10. .github/workflows/sync_to_hf_zero_gpu.yml +53 -0
  11. .gitignore +45 -0
  12. Dockerfile +222 -0
  13. README.md +0 -0
  14. _quarto.yml +33 -0
  15. app.py +0 -0
  16. cdk/__init__.py +0 -0
  17. cdk/app.py +83 -0
  18. cdk/cdk_config.py +362 -0
  19. cdk/cdk_functions.py +1482 -0
  20. cdk/cdk_stack.py +1869 -0
  21. cdk/check_resources.py +375 -0
  22. cdk/lambda_load_dynamo_logs.py +321 -0
  23. cdk/post_cdk_build_quickstart.py +40 -0
  24. cdk/requirements.txt +5 -0
  25. cli_redact.py +0 -0
  26. docker-compose_llama.yml +211 -0
  27. docker-compose_vllm.yml +163 -0
  28. entrypoint.sh +52 -0
  29. example_app_config.env +129 -0
  30. example_data/Bold minimalist professional cover letter.docx +3 -0
  31. example_data/Difficult handwritten note.jpg +3 -0
  32. example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf +3 -0
  33. example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv +0 -0
  34. example_data/Partnership-Agreement-Toolkit_0_0.pdf +3 -0
  35. example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv +2 -0
  36. example_data/combined_case_notes.csv +19 -0
  37. example_data/combined_case_notes.xlsx +3 -0
  38. example_data/doubled_output_joined.pdf +3 -0
  39. example_data/example_complaint_letter.jpg +3 -0
  40. example_data/example_of_emails_sent_to_a_professor_before_applying.pdf +3 -0
  41. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv +277 -0
  42. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv +77 -0
  43. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv +0 -0
  44. example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv +923 -0
  45. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv +40 -0
  46. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv +432 -0
  47. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv +15 -0
  48. example_data/graduate-job-example-cover-letter.pdf +3 -0
  49. example_data/partnership_toolkit_redact_custom_deny_list.csv +2 -0
  50. example_data/partnership_toolkit_redact_some_pages.csv +2 -0
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.dockerignore ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ .venv/*
5
+ examples/*
6
+ processing/*
7
+ tools/__pycache__/*
8
+ old_code/*
9
+ tesseract/*
10
+ poppler/*
11
+ build/*
12
+ dist/*
13
+ docs/*
14
+ build_deps/*
15
+ user_guide/*
16
+ cdk/config/*
17
+ tld/*
18
+ cdk/config/*
19
+ cdk/cdk.out/*
20
+ cdk/archive/*
21
+ cdk.json
22
+ cdk.context.json
23
+ .quarto/*
24
+ logs/
25
+ output/
26
+ input/
27
+ feedback/
28
+ config/
29
+ usage/
30
+ test/config/*
31
+ test/feedback/*
32
+ test/input/*
33
+ test/logs/*
34
+ test/output/*
35
+ test/tmp/*
36
+ test/usage/*
37
+ .ruff_cache/*
38
+ model_cache/*
39
+ sanitized_file/*
40
+ src/doc_redaction.egg-info/*
.gitattributes ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.jpg filter=lfs diff=lfs merge=lfs -text
3
+ *.xls filter=lfs diff=lfs merge=lfs -text
4
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
5
+ *.docx filter=lfs diff=lfs merge=lfs -text
6
+ *.doc filter=lfs diff=lfs merge=lfs -text
7
+ *.png filter=lfs diff=lfs merge=lfs -text
8
+ *.ico filter=lfs diff=lfs merge=lfs -text
.github/scripts/setup_test_data.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for GitHub Actions test data.
4
+ Creates dummy test files when example data is not available.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ import pandas as pd
11
+
12
+
13
def create_directories():
    """Ensure the example-data folder tree exists, reporting each folder."""
    for folder in ("example_data", "example_data/example_outputs"):
        # exist_ok keeps repeated runs from failing on folders already present.
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")
20
+
21
+
22
def _reportlab_available():
    """Return True if reportlab can be imported, installing it on demand.

    Returns False instead of raising when the install or the follow-up import
    fails, so the caller can fall back to placeholder files.
    """
    import importlib
    import subprocess

    try:
        from reportlab.pdfgen import canvas  # noqa: F401

        return True
    except ImportError:
        pass

    try:
        # Use the running interpreter's pip; a bare "pip" on PATH may belong
        # to a different Python installation or virtual environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "reportlab"])
    except (subprocess.CalledProcessError, OSError) as exc:
        print(f"Could not install reportlab: {exc}")
        return False

    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
    try:
        from reportlab.pdfgen import canvas  # noqa: F401
    except ImportError:
        return False
    return True


def _write_pdf(pdf_path, pages):
    """Render pages (each a sequence of (y, text) rows) to a letter-size PDF."""
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    print(f"Creating PDF: {pdf_path}")
    c = canvas.Canvas(pdf_path, pagesize=letter)
    for page_index, rows in enumerate(pages):
        if page_index:
            # Close the previous page before drawing the next one.
            c.showPage()
        for y, text in rows:
            c.drawString(100, y, text)
    c.save()
    print(f"Created dummy PDF: {pdf_path}")


def create_dummy_pdf():
    """Create dummy PDFs for testing.

    Writes three documents under ``example_data``. When reportlab is missing
    and cannot be installed, plain-text placeholder files are written to the
    same paths instead, so that later steps still find the expected names.
    """
    # (output path, placeholder text for the fallback, pages of (y, text) rows)
    documents = (
        (
            "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
            "This is a dummy PDF file for testing",
            (
                (
                    (750, "This is a test document for redaction testing."),
                    (700, "Email: test@example.com"),
                    (650, "Phone: 123-456-7890"),
                    (600, "Name: John Doe"),
                    (550, "Address: 123 Test Street, Test City, TC 12345"),
                ),
                (
                    (750, "Second page content"),
                    (700, "More test data: jane.doe@example.com"),
                    (650, "Another phone: 987-654-3210"),
                ),
            ),
        ),
        (
            "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
            "This is a dummy Partnership Agreement PDF file for testing",
            (
                (
                    (750, "Partnership Agreement Toolkit"),
                    (700, "This is a test partnership agreement document."),
                    (650, "Contact: partnership@example.com"),
                    (600, "Phone: (555) 123-4567"),
                    (550, "Address: 123 Partnership Street, City, State 12345"),
                ),
                (
                    (750, "Page 2 - Partnership Details"),
                    (700, "More partnership information here."),
                    (650, "Contact: info@partnership.org"),
                ),
                (
                    (750, "Page 3 - Terms and Conditions"),
                    (700, "Terms and conditions content."),
                    (650, "Legal contact: legal@partnership.org"),
                ),
            ),
        ),
        (
            "example_data/graduate-job-example-cover-letter.pdf",
            "This is a dummy cover letter PDF file for testing",
            (
                (
                    (750, "Cover Letter Example"),
                    (700, "Dear Hiring Manager,"),
                    (650, "I am writing to apply for the position."),
                    (600, "Contact: applicant@example.com"),
                    (550, "Phone: (555) 987-6543"),
                    (500, "Address: 456 Job Street, Employment City, EC 54321"),
                    (450, "Sincerely,"),
                    (400, "John Applicant"),
                ),
            ),
        ),
    )

    if _reportlab_available():
        print(f"Directory exists: {os.path.exists('example_data')}")
        for pdf_path, _placeholder, pages in documents:
            _write_pdf(pdf_path, pages)
        return

    print("ReportLab not available, skipping PDF creation")
    # Create simple text files instead, keeping the expected file names.
    for pdf_path, placeholder, _pages in documents:
        with open(pdf_path, "w", encoding="utf-8") as f:
            f.write(placeholder)
    print("Created dummy text files instead of PDFs")
123
+
124
+
125
def create_dummy_csv():
    """Write the placeholder CSV inputs: case notes and the Lambeth extract."""
    # Case notes table; columns are written in the order given here.
    case_notes = pd.DataFrame(
        {
            "Case Note": [
                "Client visited for consultation regarding housing issues",
                "Follow-up appointment scheduled for next week",
                "Documentation submitted for review",
            ],
            "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
            "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
        }
    )
    case_notes.to_csv("example_data/combined_case_notes.csv", index=False)
    print("Created dummy CSV: example_data/combined_case_notes.csv")

    # Page-level text table standing in for the Lambeth document extract.
    lambeth_pages = pd.DataFrame(
        {
            "text": [
                "Lambeth 2030 vision document content",
                "Our Future Our Lambeth strategic plan",
                "Community engagement and development",
            ],
            "page": [1, 2, 3],
        }
    )
    lambeth_pages.to_csv(
        "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", index=False
    )
    print("Created dummy CSV: example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv")
155
+
156
+
157
def create_dummy_word_doc():
    """Write a small .docx fixture; skipped with a message if python-docx is absent."""
    try:
        from docx import Document

        body_lines = (
            "This is a test document for redaction testing.",
            "Contact Information:",
            "Email: test@example.com",
            "Phone: 123-456-7890",
            "Name: John Doe",
            "Address: 123 Test Street, Test City, TC 12345",
        )

        document = Document()
        document.add_heading("Test Document for Redaction", 0)
        for line in body_lines:
            document.add_paragraph(line)

        document.save("example_data/Bold minimalist professional cover letter.docx")
        print("Created dummy Word document")

    except ImportError:
        print("python-docx not available, skipping Word document creation")
176
+
177
+
178
def create_allow_deny_lists():
    """Write the allow-list, deny-list and whole-page redaction CSV fixtures."""
    # Allow lists: the same word table is written to both target files.
    allow_frame = pd.DataFrame({"word": ["test", "example", "document"]})
    for target in (
        "example_data/test_allow_list_graduate.csv",
        "example_data/test_allow_list_partnership.csv",
    ):
        allow_frame.to_csv(target, index=False)
    print("Created allow lists")

    # Deny lists: again one table shared by two files.
    deny_frame = pd.DataFrame({"word": ["sensitive", "confidential", "private"]})
    for target in (
        "example_data/partnership_toolkit_redact_custom_deny_list.csv",
        "example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        deny_frame.to_csv(target, index=False)
    print("Created deny lists")

    # Whole page redaction list
    pages_frame = pd.DataFrame({"page": [1, 2]})
    pages_frame.to_csv(
        "example_data/partnership_toolkit_redact_some_pages.csv", index=False
    )
    print("Created whole page redaction list")
207
+
208
+
209
def create_ocr_output():
    """Write a three-row stand-in for an OCR results CSV."""
    output_path = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
    # One row per page; columns are written in the order listed here.
    ocr_rows = pd.DataFrame(
        {
            "page": [1, 2, 3],
            "text": [
                "This is page 1 content with some text",
                "This is page 2 content with different text",
                "This is page 3 content with more text",
            ],
            "left": [0.1, 0.3, 0.5],
            "top": [0.95, 0.92, 0.88],
            "width": [0.05, 0.02, 0.02],
            "height": [0.01, 0.02, 0.02],
            "line": [1, 2, 3],
        }
    )
    ocr_rows.to_csv(output_path, index=False)
    print("Created dummy OCR output CSV")
230
+
231
+
232
def create_dummy_image():
    """Draw a few lines of fake personal data onto a white JPEG fixture.

    Skipped with a message when Pillow cannot be imported.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont

        def pick_font():
            # Try two fixed system font paths in turn, reporting each failure,
            # then fall back to Pillow's built-in default font.
            try:
                return ImageFont.truetype(
                    "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20
                )
            except Exception as e:
                print(f"Error loading DejaVuSans font: {e}")
            try:
                return ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 20)
            except Exception as e:
                print(f"Error loading Arial font: {e}")
            return ImageFont.load_default()

        canvas_image = Image.new("RGB", (800, 600), color="white")
        pen = ImageDraw.Draw(canvas_image)
        font = pick_font()

        # Text rows, top to bottom, all starting at x=50.
        rows = (
            (50, "Test Document for Redaction"),
            (100, "Email: test@example.com"),
            (150, "Phone: 123-456-7890"),
            (200, "Name: John Doe"),
            (250, "Address: 123 Test Street"),
        )
        for y, text in rows:
            pen.text((50, y), text, fill="black", font=font)

        canvas_image.save("example_data/example_complaint_letter.jpg")
        print("Created dummy image")

    except ImportError:
        print("PIL not available, skipping image creation")
265
+
266
+
267
def main():
    """Build every dummy fixture, then list what was created and check key files."""
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    # Directories first; the remaining builders write into them.
    builders = (
        create_directories,
        create_dummy_pdf,
        create_dummy_csv,
        create_dummy_word_doc,
        create_allow_deny_lists,
        create_ocr_output,
        create_dummy_image,
    )
    for build in builders:
        build()

    print("\nTest data setup complete!")
    print("Created files:")
    for root, _subdirs, files in os.walk("example_data"):
        for file in files:
            file_path = os.path.join(root, file)
            print(f" {file_path}")
            # Report the size, or warn if the listed entry cannot be found.
            if not os.path.exists(file_path):
                print(" WARNING: File does not exist!")
                continue
            file_size = os.path.getsize(file_path)
            print(f" Size: {file_size} bytes")

    # The three PDFs are reported separately; a missing one is only reported,
    # it does not abort the script.
    print("\nVerifying critical test files:")
    for file_path in (
        "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
        "example_data/graduate-job-example-cover-letter.pdf",
        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ):
        if not os.path.exists(file_path):
            print(f"❌ {file_path} MISSING!")
            continue
        file_size = os.path.getsize(file_path)
        print(f"✅ {file_path} exists ({file_size} bytes)")
308
+
309
+
310
+ if __name__ == "__main__":
311
+ main()
.github/workflow_README.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitHub Actions CI/CD Setup
2
+
3
+ This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
4
+
5
+ ## Workflows Overview
6
+
7
+ ### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
8
+ - **Purpose**: Basic test execution
9
+ - **Triggers**: Push to `dev`, pull requests targeting `dev`
10
+ - **OS**: Ubuntu Latest
11
+ - **Python**: 3.12
12
+ - **Features**:
13
+ - Installs system dependencies
14
+ - Sets up test data
15
+ - Runs CLI tests
16
+ - Runs pytest
17
+
18
+ ### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
19
+ - **Purpose**: Full CI/CD pipeline
20
+ - **Features**:
21
+ - Linting (Ruff, Black)
22
+ - Unit tests (Python 3.11, 3.12, 3.13)
23
+ - Integration tests
24
+ - Security scanning (Bandit; the Safety scan is currently disabled because it requires a login)
25
+ - Coverage reporting
26
+ - Package building (on main branch)
27
+
28
+ ### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`, archived)
29
+ - **Purpose**: Cross-platform testing
30
+ - **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
31
+ - **Python**: 3.11, 3.12, 3.13
32
+ - **Features**: Tests compatibility across different operating systems
33
+
34
+ ### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
35
+ - **Purpose**: Original test workflow
36
+ - **Features**:
37
+ - Multiple Python versions
38
+ - System dependency installation
39
+ - Test data creation
40
+ - Coverage reporting
41
+
42
+ ## Setup Scripts
43
+
44
+ ### Test Data Setup (`.github/scripts/setup_test_data.py`)
45
+ Creates dummy test files when example data is not available:
46
+ - PDF documents
47
+ - CSV files
48
+ - Word documents
49
+ - Images
50
+ - Allow/deny lists
51
+ - OCR output files
52
+
53
+ ## Usage
54
+
55
+ ### Running Tests Locally
56
+
57
+ ```bash
58
+ # Install dependencies
59
+ pip install -r requirements.txt
60
+ pip install pytest pytest-cov
61
+
62
+ # Setup test data
63
+ python .github/scripts/setup_test_data.py
64
+
65
+ # Run tests
66
+ cd test
67
+ python test.py
68
+ ```
69
+
70
+ ### GitHub Actions Triggers
71
+
72
+ 1. **Push to main/dev**: Runs all tests
73
+ 2. **Pull Request**: Runs tests and linting
74
+ 3. **Daily Schedule**: Runs tests at 2 AM UTC (currently commented out in `ci.yml`)
75
+ 4. **Manual Trigger**: Can be triggered manually from GitHub
76
+
77
+ ## Configuration
78
+
79
+ ### Environment Variables
80
+ - `PYTHON_VERSION`: Default Python version (3.11)
81
+ - `PYTHONPATH`: Set automatically for test discovery
82
+
83
+ ### Caching
84
+ - Pip dependencies are cached for faster builds
85
+ - Cache key based on requirements.txt hash
86
+
87
+ ### Artifacts
88
+ - Test results (JUnit XML)
89
+ - Coverage reports (HTML, XML)
90
+ - Security reports
91
+ - Build artifacts (on main branch)
92
+
93
+ ## Test Data
94
+
95
+ The workflows automatically create test data when example files are missing:
96
+
97
+ ### Required Files Created:
98
+ - `example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
99
+ - `example_data/combined_case_notes.csv`
100
+ - `example_data/Bold minimalist professional cover letter.docx`
101
+ - `example_data/example_complaint_letter.jpg`
102
+ - `example_data/test_allow_list_*.csv`
103
+ - `example_data/partnership_toolkit_redact_*.csv`
104
+ - `example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
105
+
106
+ ### Dependencies Installed:
107
+ - **System**: tesseract-ocr, poppler-utils, OpenGL libraries
108
+ - **Python**: All `requirements_lightweight.txt` packages + pytest, reportlab, pillow
109
+
110
+ ## Workflow Status
111
+
112
+ ### Success Criteria:
113
+ - ✅ All tests pass
114
+ - ✅ No linting errors
115
+ - ✅ Security checks pass
116
+ - ✅ Coverage meets threshold (if configured)
117
+
118
+ ### Failure Handling:
119
+ - Tests are designed to skip gracefully if files are missing
120
+ - AWS tests are expected to fail without credentials
121
+ - System dependency failures are handled with fallbacks
122
+
123
+ ## Customization
124
+
125
+ ### Adding New Tests:
126
+ 1. Add test methods to `test/test.py`
127
+ 2. Update test data in `setup_test_data.py` if needed
128
+ 3. Tests will automatically run in all workflows
129
+
130
+ ### Modifying Workflows:
131
+ 1. Edit the appropriate `.yml` file
132
+ 2. Test locally first
133
+ 3. Push to trigger the workflow
134
+
135
+ ### Environment-Specific Settings:
136
+ - **Ubuntu**: Full system dependencies
137
+ - **Windows**: Python packages only
138
+ - **macOS**: Homebrew dependencies
139
+
140
+ ## Troubleshooting
141
+
142
+ ### Common Issues:
143
+
144
+ 1. **Missing Dependencies**:
145
+ - Check system dependency installation
146
+ - Verify Python package versions
147
+
148
+ 2. **Test Failures**:
149
+ - Check test data creation
150
+ - Verify file paths
151
+ - Review test output logs
152
+
153
+ 3. **AWS Test Failures**:
154
+ - Expected without credentials
155
+ - Tests are designed to handle this gracefully
156
+
157
+ 4. **System Dependency Issues**:
158
+ - Different OS have different requirements
159
+ - Check the specific OS section in workflows
160
+
161
+ ### Debug Mode:
162
+ Add `--verbose` or `-v` flags to pytest commands for more detailed output.
163
+
164
+ ## Security
165
+
166
+ - Dependency scanning with Safety is currently disabled (the scan now requires a login)
167
+ - Code is scanned with Bandit
168
+ - No secrets are exposed in logs
169
+ - Test data is temporary and cleaned up
170
+
171
+ ## Performance
172
+
173
+ - Tests run in parallel where possible
174
+ - Dependencies are cached
175
+ - Only necessary system packages are installed
176
+ - Test data is created efficiently
177
+
178
+ ## Monitoring
179
+
180
+ - Workflow status is visible in GitHub Actions tab
181
+ - Coverage reports are uploaded as workflow artifacts (the Codecov upload step is currently disabled)
182
+ - Test results are available as artifacts
183
+ - Security reports are generated and stored
.github/workflows/archive_workflows/multi-os-test.yml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Multi-OS Test
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ${{ matrix.os }}
16
+ strategy:
17
+ matrix:
18
+ os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
19
+ python-version: ["3.11", "3.12", "3.13"]
20
+ exclude:
21
+ # Exclude some combinations to reduce CI time
22
+ #- os: windows-latest
23
+ # python-version: ["3.12", "3.13"]
24
+ - os: macos-latest
25
+ python-version: ["3.12", "3.13"]
26
+
27
+ steps:
28
+ - uses: actions/checkout@v6
29
+
30
+ - name: Set up Python ${{ matrix.python-version }}
31
+ uses: actions/setup-python@v6
32
+ with:
33
+ python-version: ${{ matrix.python-version }}
34
+
35
+ - name: Install system dependencies (Ubuntu)
36
+ if: matrix.os == 'ubuntu-latest'
37
+ run: |
38
+ sudo apt-get update
39
+ sudo apt-get install -y \
40
+ tesseract-ocr \
41
+ tesseract-ocr-eng \
42
+ poppler-utils \
43
+ libgl1-mesa-dri \
44
+ libglib2.0-0
45
+
46
+ - name: Install system dependencies (macOS)
47
+ if: matrix.os == 'macos-latest'
48
+ run: |
49
+ brew install tesseract poppler
50
+
51
+ - name: Install system dependencies (Windows)
52
+ if: matrix.os == 'windows-latest'
53
+ run: |
54
+ # Create tools directory
55
+ if (!(Test-Path "C:\tools")) {
56
+ mkdir C:\tools
57
+ }
58
+
59
+ # Download and install Tesseract
60
+ $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
61
+ $tesseractInstaller = "C:\tools\tesseract-installer.exe"
62
+ Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
63
+
64
+ # Install Tesseract silently
65
+ Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
66
+
67
+ # Download and extract Poppler
68
+ $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
69
+ $popplerZip = "C:\tools\poppler.zip"
70
+ Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
71
+
72
+ # Extract Poppler
73
+ Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
74
+
75
+ # Add to PATH
76
+ echo "C:\tools\tesseract" >> $env:GITHUB_PATH
77
+ echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
78
+
79
+ # Set environment variables for your application
80
+ echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
81
+ echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
82
+ echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
83
+
84
+ # Verify installation using full paths (since PATH won't be updated in current session)
85
+ & "C:\tools\tesseract\tesseract.exe" --version
86
+ & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
87
+
88
+ - name: Install Python dependencies
89
+ run: |
90
+ python -m pip install --upgrade pip
91
+ pip install -r requirements.txt
92
+ pip install pytest pytest-cov reportlab pillow
93
+
94
+ - name: Download spaCy model
95
+ run: |
96
+ python -m spacy download en_core_web_lg
97
+
98
+ - name: Setup test data
99
+ run: |
100
+ python .github/scripts/setup_test_data.py
101
+
102
+ - name: Run CLI tests
103
+ run: |
104
+ cd test
105
+ python test.py
106
+
107
+ - name: Run tests with pytest
108
+ run: |
109
+ pytest test/test.py -v --tb=short
.github/workflows/ci.yml ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ #schedule:
9
+ # Run tests daily at 2 AM UTC
10
+ # - cron: '0 2 * * *'
11
+
12
+ permissions:
13
+ contents: read
14
+ actions: read
15
+ pull-requests: write
16
+ issues: write
17
+
18
+ env:
19
+ PYTHON_VERSION: "3.11"
20
+
21
+ jobs:
22
+ lint:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - uses: actions/checkout@v6
26
+
27
+ - name: Set up Python
28
+ uses: actions/setup-python@v6
29
+ with:
30
+ python-version: ${{ env.PYTHON_VERSION }}
31
+
32
+ - name: Install dependencies
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install ruff black
36
+
37
+ - name: Run Ruff linter
38
+ run: ruff check .
39
+
40
+ - name: Run Black formatter check
41
+ run: black --check .
42
+
43
+ test-unit:
44
+ runs-on: ubuntu-latest
45
+ strategy:
46
+ matrix:
47
+ python-version: [3.11, 3.12, 3.13]
48
+
49
+ steps:
50
+ - uses: actions/checkout@v6
51
+
52
+ - name: Set up Python ${{ matrix.python-version }}
53
+ uses: actions/setup-python@v6
54
+ with:
55
+ python-version: ${{ matrix.python-version }}
56
+
57
+ - name: Cache pip dependencies
58
+ uses: actions/cache@v5
59
+ with:
60
+ path: ~/.cache/pip
61
+ key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
62
+ restore-keys: |
63
+ ${{ runner.os }}-pip-
64
+
65
+ - name: Install system dependencies
66
+ run: |
67
+ sudo apt-get update
68
+ sudo apt-get install -y \
69
+ tesseract-ocr \
70
+ tesseract-ocr-eng \
71
+ poppler-utils \
72
+ libgl1-mesa-dri \
73
+ libglib2.0-0 \
74
+ libsm6 \
75
+ libxext6 \
76
+ libxrender-dev \
77
+ libgomp1
78
+
79
+ - name: Install Python dependencies
80
+ run: |
81
+ python -m pip install --upgrade pip
82
+ pip install -r requirements_lightweight.txt
83
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
84
+
85
+ - name: Download spaCy model
86
+ run: |
87
+ python -m spacy download en_core_web_lg
88
+
89
+ - name: Setup test data
90
+ run: |
91
+ python .github/scripts/setup_test_data.py
92
+ echo "Setup script completed. Checking results:"
93
+ ls -la example_data/ || echo "example_data directory not found"
94
+
95
+ - name: Verify test data files
96
+ run: |
97
+ echo "Checking if critical test files exist:"
98
+ ls -la example_data/
99
+ echo "Checking for specific PDF files:"
100
+ ls -la example_data/*.pdf || echo "No PDF files found"
101
+ echo "Checking file sizes:"
102
+ find example_data -name "*.pdf" -exec ls -lh {} \;
103
+
104
+ - name: Clean up problematic config files
105
+ run: |
106
+ rm -f config*.py || true
107
+
108
+ - name: Run CLI tests
109
+ run: |
110
+ cd test
111
+ python test.py
112
+
113
+ - name: Run tests with pytest
114
+ run: |
115
+ pytest test/test.py -v --tb=short --junitxml=test-results.xml
116
+
117
+ - name: Run tests with coverage
118
+ run: |
119
+ pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
120
+
121
+ #- name: Upload coverage to Codecov - not necessary
122
+ # uses: codecov/codecov-action@v3
123
+ # if: matrix.python-version == '3.11'
124
+ # with:
125
+ # file: ./coverage.xml
126
+ # flags: unittests
127
+ # name: codecov-umbrella
128
+ # fail_ci_if_error: false
129
+
130
+ - name: Upload test results
131
+ uses: actions/upload-artifact@v6
132
+ if: always()
133
+ with:
134
+ name: test-results-python-${{ matrix.python-version }}
135
+ path: |
136
+ test-results.xml
137
+ htmlcov/
138
+ coverage.xml
139
+
140
+ test-integration:
141
+ runs-on: ubuntu-latest
142
+ needs: [lint, test-unit]
143
+
144
+ steps:
145
+ - uses: actions/checkout@v6
146
+
147
+ - name: Set up Python
148
+ uses: actions/setup-python@v6
149
+ with:
150
+ python-version: ${{ env.PYTHON_VERSION }}
151
+
152
+ - name: Install dependencies
153
+ run: |
154
+ python -m pip install --upgrade pip
155
+ pip install -r requirements_lightweight.txt
156
+ pip install pytest pytest-cov reportlab pillow
157
+
158
+ - name: Install system dependencies
159
+ run: |
160
+ sudo apt-get update
161
+ sudo apt-get install -y \
162
+ tesseract-ocr \
163
+ tesseract-ocr-eng \
164
+ poppler-utils \
165
+ libgl1-mesa-dri \
166
+ libglib2.0-0
167
+
168
+ - name: Download spaCy model
169
+ run: |
170
+ python -m spacy download en_core_web_lg
171
+
172
+ - name: Setup test data
173
+ run: |
174
+ python .github/scripts/setup_test_data.py
175
+ echo "Setup script completed. Checking results:"
176
+ ls -la example_data/ || echo "example_data directory not found"
177
+
178
+ - name: Verify test data files
179
+ run: |
180
+ echo "Checking if critical test files exist:"
181
+ ls -la example_data/
182
+ echo "Checking for specific PDF files:"
183
+ ls -la example_data/*.pdf || echo "No PDF files found"
184
+ echo "Checking file sizes:"
185
+ find example_data -name "*.pdf" -exec ls -lh {} \;
186
+
187
+ - name: Run integration tests
188
+ run: |
189
+ cd test
190
+ python demo_single_test.py
191
+
192
+ - name: Test CLI help
193
+ run: |
194
+ python cli_redact.py --help
195
+
196
+ - name: Test CLI version
197
+ run: |
198
+ python -c "import sys; print(f'Python {sys.version}')"
199
+
200
+ security:
201
+ runs-on: ubuntu-latest
202
+ steps:
203
+ - uses: actions/checkout@v6
204
+
205
+ - name: Set up Python
206
+ uses: actions/setup-python@v6
207
+ with:
208
+ python-version: ${{ env.PYTHON_VERSION }}
209
+
210
+ - name: Install dependencies
211
+ run: |
212
+ python -m pip install --upgrade pip
213
+ pip install safety bandit
214
+
215
+ #- name: Run safety scan - removed as now requires login
216
+ # run: |
217
+ # safety scan -r requirements.txt
218
+
219
+ - name: Run bandit security check
220
+ run: |
221
+ bandit -r . -f json -o bandit-report.json || true
222
+
223
+ - name: Upload security report
224
+ uses: actions/upload-artifact@v6
225
+ if: always()
226
+ with:
227
+ name: security-report
228
+ path: bandit-report.json
229
+
230
+ build:
231
+ runs-on: ubuntu-latest
232
+ needs: [lint, test-unit]
233
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
234
+
235
+ steps:
236
+ - uses: actions/checkout@v6
237
+
238
+ - name: Set up Python
239
+ uses: actions/setup-python@v6
240
+ with:
241
+ python-version: ${{ env.PYTHON_VERSION }}
242
+
243
+ - name: Install build dependencies
244
+ run: |
245
+ python -m pip install --upgrade pip
246
+ pip install build twine
247
+
248
+ - name: Build package
249
+ run: |
250
+ python -m build
251
+
252
+ - name: Check package
253
+ run: |
254
+ twine check dist/*
255
+
256
+ - name: Upload build artifacts
257
+ uses: actions/upload-artifact@v6
258
+ with:
259
+ name: dist
260
+ path: dist/
.github/workflows/simple-test.yml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Simple Test Run
2
+
3
+ on:
4
+ push:
5
+ branches: [ dev ]
6
+ pull_request:
7
+ branches: [ dev ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ubuntu-latest
16
+
17
+ steps:
18
+ - uses: actions/checkout@v6
19
+
20
+ - name: Set up Python 3.12
21
+ uses: actions/setup-python@v6
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Install system dependencies
26
+ run: |
27
+ sudo apt-get update
28
+ sudo apt-get install -y \
29
+ tesseract-ocr \
30
+ tesseract-ocr-eng \
31
+ poppler-utils \
32
+ libgl1-mesa-dri \
33
+ libglib2.0-0
34
+
35
+ - name: Install Python dependencies
36
+ run: |
37
+ python -m pip install --upgrade pip
38
+ pip install -r requirements_lightweight.txt
39
+ pip install pytest pytest-cov reportlab pillow
40
+
41
+ - name: Download spaCy model
42
+ run: |
43
+ python -m spacy download en_core_web_lg
44
+
45
+ - name: Setup test data
46
+ run: |
47
+ python .github/scripts/setup_test_data.py
48
+ echo "Setup script completed. Checking results:"
49
+ ls -la example_data/ || echo "example_data directory not found"
50
+
51
+ - name: Verify test data files
52
+ run: |
53
+ echo "Checking if critical test files exist:"
54
+ ls -la example_data/
55
+ echo "Checking for specific PDF files:"
56
+ ls -la example_data/*.pdf || echo "No PDF files found"
57
+ echo "Checking file sizes:"
58
+ find example_data -name "*.pdf" -exec ls -lh {} \;
59
+
60
+ - name: Run CLI tests
61
+ run: |
62
+ cd test
63
+ python test.py
64
+
65
+ - name: Run tests with pytest
66
+ run: |
67
+ pytest test/test.py -v --tb=short
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ permissions:
7
+ contents: read
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v6
14
+ with:
15
+ fetch-depth: 1 # Only get the latest state
16
+ lfs: true # Download actual LFS files so they can be pushed
17
+
18
+ - name: Install Git LFS
19
+ run: git lfs install
20
+
21
+ - name: Recreate repo history (single-commit force push)
22
+ run: |
23
+ # 1. Capture the message BEFORE we delete the .git folder
24
+ COMMIT_MSG=$(git log -1 --pretty=%B)
25
+ echo "Syncing commit message: $COMMIT_MSG"
26
+
27
+ # 2. DELETE the .git folder.
28
+ # This turns the repo into a standard folder of files.
29
+ rm -rf .git
30
+
31
+ # 3. Re-initialize a brand new git repo
32
+ git init -b main
33
+ git config --global user.name "$HF_USERNAME"
34
+ git config --global user.email "$HF_EMAIL"
35
+
36
+ # 4. Re-install LFS (needs to be done after git init)
37
+ git lfs install
38
+
39
+ # 5. Add the remote
40
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
41
+
42
+ # 6. Add all files
43
+ # Since this is a fresh init, Git sees EVERY file as "New"
44
+ git add .
45
+
46
+ # 7. Commit and Force Push
47
+ git commit -m "Sync: $COMMIT_MSG"
48
+ git push --force hf main
49
+ env:
50
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
51
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
52
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
53
+ HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
.github/workflows/sync_to_hf_zero_gpu.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub Zero GPU
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+
6
+ permissions:
7
+ contents: read
8
+
9
+ jobs:
10
+ sync-to-hub-zero-gpu:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v6
14
+ with:
15
+ fetch-depth: 1 # Only get the latest state
16
+ lfs: true # Download actual LFS files so they can be pushed
17
+
18
+ - name: Install Git LFS
19
+ run: git lfs install
20
+
21
+ - name: Recreate repo history (single-commit force push)
22
+ run: |
23
+ # 1. Capture the message BEFORE we delete the .git folder
24
+ COMMIT_MSG=$(git log -1 --pretty=%B)
25
+ echo "Syncing commit message: $COMMIT_MSG"
26
+
27
+ # 2. DELETE the .git folder.
28
+ # This turns the repo into a standard folder of files.
29
+ rm -rf .git
30
+
31
+ # 3. Re-initialize a brand new git repo
32
+ git init -b main
33
+ git config --global user.name "$HF_USERNAME"
34
+ git config --global user.email "$HF_EMAIL"
35
+
36
+ # 4. Re-install LFS (needs to be done after git init)
37
+ git lfs install
38
+
39
+ # 5. Add the remote
40
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
41
+
42
+ # 6. Add all files
43
+ # Since this is a fresh init, Git sees EVERY file as "New"
44
+ git add .
45
+
46
+ # 7. Commit and Force Push
47
+ git commit -m "Sync: $COMMIT_MSG"
48
+ git push --force hf main
49
+ env:
50
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
51
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
52
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
53
+ HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
.gitignore ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ .venv/*
5
+ examples/*
6
+ processing/*
7
+ input/*
8
+ output/*
9
+ tools/__pycache__/*
10
+ old_code/*
11
+ tesseract/*
12
+ poppler/*
13
+ build/*
14
+ dist/*
15
+ build_deps/*
16
+ logs/*
17
+ usage/*
18
+ feedback/*
19
+ config/*
20
+ user_guide/*
21
+ cdk/config/*
22
+ cdk/cdk.out/*
23
+ cdk/archive/*
24
+ tld/*
25
+ tmp/*
26
+ docs/*
27
+ cdk.out/*
28
+ cdk.json
29
+ cdk.context.json
30
+ .quarto/*
31
+ /.quarto/
32
+ /_site/
33
+ test/config/*
34
+ test/feedback/*
35
+ test/input/*
36
+ test/logs/*
37
+ test/output/*
38
+ test/tmp/*
39
+ test/usage/*
40
+ .ruff_cache/*
41
+ model_cache/*
42
+ sanitized_file/*
43
+ src/doc_redaction.egg-info/*
44
+
45
+ **/*.quarto_ipynb
Dockerfile ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage 1: Build dependencies and download models
2
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
3
+
4
+ # Install system dependencies
5
+ RUN apt-get update \
6
+ && apt-get upgrade -y \
7
+ && apt-get install -y --no-install-recommends \
8
+ g++ \
9
+ make \
10
+ cmake \
11
+ unzip \
12
+ libcurl4-openssl-dev \
13
+ git \
14
+ && pip install --upgrade pip \
15
+ && apt-get clean \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ WORKDIR /src
19
+
20
+ COPY requirements_lightweight.txt .
21
+
22
+ RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
23
+
24
+ # Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
25
+
26
+ ARG INSTALL_PADDLEOCR=False
27
+ ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
28
+
29
+ ARG PADDLE_GPU_ENABLED=False
30
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
31
+
32
+ RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
33
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
34
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
35
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
36
+ elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
37
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
38
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ && \
39
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
40
+ fi
41
+
42
+ ARG INSTALL_VLM=False
43
+ ENV INSTALL_VLM=${INSTALL_VLM}
44
+
45
+ ARG TORCH_GPU_ENABLED=False
46
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
47
+
48
+ # Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
49
+ RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
50
+ pip install --verbose --no-cache-dir --target=/install \
51
+ "torch==2.9.1+cpu" \
52
+ "torchvision==0.24.1+cpu" \
53
+ "transformers<=5.30.0" \
54
+ "accelerate<=1.13.0" \
55
+ "bitsandbytes<=0.49.2" \
56
+ "sentencepiece<=0.2.1" \
57
+ --extra-index-url https://download.pytorch.org/whl/cpu; \
58
+ elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
59
+ pip install --verbose --no-cache-dir --target=/install "torch<=2.8.0" --index-url https://download.pytorch.org/whl/cu129 && \
60
+ pip install --verbose --no-cache-dir --target=/install "torchvision<=0.23.0" --index-url https://download.pytorch.org/whl/cu129 && \
61
+ pip install --verbose --no-cache-dir --target=/install \
62
+ "transformers<=5.30.0" \
63
+ "accelerate<=1.13.0" \
64
+ "bitsandbytes<=0.49.2" \
65
+ "sentencepiece<=0.2.1" && \
66
+ pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
67
+ pip install --verbose --no-cache-dir --target=/install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
68
+ pip install --verbose --no-cache-dir --target=/install https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
69
+ fi
70
+
71
+ # ===================================================================
72
+ # Stage 2: A common base for both Lambda and Gradio
73
+ # ===================================================================
74
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
75
+
76
+ # MUST re-declare ARGs in every stage where they are used in RUN commands
77
+ ARG TORCH_GPU_ENABLED=False
78
+ ARG PADDLE_GPU_ENABLED=False
79
+
80
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
81
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
82
+
83
+ RUN apt-get update && apt-get install -y --no-install-recommends \
84
+ tesseract-ocr \
85
+ poppler-utils \
86
+ libgl1 \
87
+ libglib2.0-0 && \
88
+ if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
89
+ apt-get install -y --no-install-recommends libgomp1; \
90
+ fi && \
91
+ apt-get clean && rm -rf /var/lib/apt/lists/*
92
+
93
+ ENV APP_HOME=/home/user
94
+
95
+ # Set env variables for Gradio & other apps
96
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
97
+ TLDEXTRACT_CACHE=/tmp/tld/ \
98
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
99
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
100
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
101
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
102
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
103
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
104
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
105
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
106
+ TESSERACT_DATA_FOLDER=/usr/share/tessdata \
107
+ GRADIO_SERVER_NAME=0.0.0.0 \
108
+ GRADIO_SERVER_PORT=7860 \
109
+ PATH=$APP_HOME/.local/bin:$PATH \
110
+ PYTHONPATH=$APP_HOME/app \
111
+ PYTHONUNBUFFERED=1 \
112
+ PYTHONDONTWRITEBYTECODE=1 \
113
+ GRADIO_ALLOW_FLAGGING=never \
114
+ GRADIO_NUM_PORTS=1 \
115
+ GRADIO_ANALYTICS_ENABLED=False
116
+
117
+ # Copy Python packages from the builder stage
118
+ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
119
+ COPY --from=builder /install/bin /usr/local/bin/
120
+
121
+ # Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
122
+ # passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
123
+ RUN pip install --no-cache-dir "protobuf<=7.34.0"
124
+
125
+ # Copy your application code and entrypoint
126
+ COPY . ${APP_HOME}/app
127
+ COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
128
+ # Fix line endings and set execute permissions
129
+ RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
130
+ && chmod +x ${APP_HOME}/app/entrypoint.sh
131
+
132
+ WORKDIR ${APP_HOME}/app
133
+
134
+ # ===================================================================
135
+ # FINAL Stage 3: The Lambda Image (runs as root for simplicity)
136
+ # ===================================================================
137
+ FROM base AS lambda
138
+ # Set runtime ENV for Lambda mode
139
+ ENV APP_MODE=lambda
140
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
141
+ CMD ["lambda_entrypoint.lambda_handler"]
142
+
143
+ # ===================================================================
144
+ # FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
145
+ # ===================================================================
146
+ FROM base AS gradio
147
+ # Set runtime ENV for Gradio mode
148
+ ENV APP_MODE=gradio
149
+
150
+ # Create non-root user
151
+ RUN useradd -m -u 1000 user
152
+
153
+ # Create the base application directory and set its ownership
154
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
155
+
156
+ # Create required sub-folders within the app directory and set their permissions
157
+ # This ensures these specific directories are owned by 'user'
158
+ RUN mkdir -p \
159
+ ${APP_HOME}/app/output \
160
+ ${APP_HOME}/app/input \
161
+ ${APP_HOME}/app/logs \
162
+ ${APP_HOME}/app/usage \
163
+ ${APP_HOME}/app/feedback \
164
+ ${APP_HOME}/app/config \
165
+ && chown user:user \
166
+ ${APP_HOME}/app/output \
167
+ ${APP_HOME}/app/input \
168
+ ${APP_HOME}/app/logs \
169
+ ${APP_HOME}/app/usage \
170
+ ${APP_HOME}/app/feedback \
171
+ ${APP_HOME}/app/config \
172
+ && chmod 755 \
173
+ ${APP_HOME}/app/output \
174
+ ${APP_HOME}/app/input \
175
+ ${APP_HOME}/app/logs \
176
+ ${APP_HOME}/app/usage \
177
+ ${APP_HOME}/app/feedback \
178
+ ${APP_HOME}/app/config
179
+
180
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
181
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
182
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
183
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
184
+ && chmod 700 ${XDG_CACHE_HOME} \
185
+ && mkdir -p ${APP_HOME}/.paddlex \
186
+ && chown user:user ${APP_HOME}/.paddlex \
187
+ && chmod 755 ${APP_HOME}/.paddlex \
188
+ && mkdir -p ${APP_HOME}/.local/share/spacy/data \
189
+ && chown user:user ${APP_HOME}/.local/share/spacy/data \
190
+ && chmod 755 ${APP_HOME}/.local/share/spacy/data \
191
+ && mkdir -p /usr/share/tessdata \
192
+ && chown user:user /usr/share/tessdata \
193
+ && chmod 755 /usr/share/tessdata
194
+
195
+ # Apply user ownership to all files in the home directory
196
+ RUN chown -R user:user /home/user
197
+
198
+ # Set permissions for Python executable
199
+ RUN chmod 755 /usr/local/bin/python
200
+
201
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
202
+ VOLUME ["/tmp/matplotlib_cache"]
203
+ VOLUME ["/tmp/gradio_tmp"]
204
+ VOLUME ["/tmp/tld"]
205
+ VOLUME ["/home/user/app/output"]
206
+ VOLUME ["/home/user/app/input"]
207
+ VOLUME ["/home/user/app/logs"]
208
+ VOLUME ["/home/user/app/usage"]
209
+ VOLUME ["/home/user/app/feedback"]
210
+ VOLUME ["/home/user/app/config"]
211
+ VOLUME ["/home/user/.paddlex"]
212
+ VOLUME ["/home/user/.local/share/spacy/data"]
213
+ VOLUME ["/usr/share/tessdata"]
214
+ VOLUME ["/tmp"]
215
+ VOLUME ["/var/tmp"]
216
+
217
+ USER user
218
+
219
+ EXPOSE $GRADIO_SERVER_PORT
220
+
221
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
222
+ CMD ["python", "app.py"]
README.md ADDED
The diff for this file is too large to render. See raw diff
 
_quarto.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project:
2
+ type: website
3
+ output-dir: docs
4
+ render:
5
+ - "*.qmd"
6
+
7
+ website:
8
+ title: "Document Redaction App"
9
+ page-navigation: true
10
+ back-to-top-navigation: true
11
+ search: true
12
+ google-analytics: G-9JNEKNN14K
13
+ navbar:
14
+ left:
15
+ - href: index.qmd
16
+ text: Home
17
+ - href: src/user_guide.qmd
18
+ text: User guide
19
+ - href: src/faq.qmd
20
+ text: User FAQ
21
+ - href: src/installation_guide.qmd
22
+ text: App installation guide (with CDK)
23
+ - href: src/app_settings.qmd
24
+ text: App settings management guide
25
+ - href: src/redaction_with_vlm_and_llms.qmd
26
+ text: Redaction with local VLM and LLMs (Qwen 3)
27
+ - href: src/ocr_and_redaction_with_qwen35.qmd
28
+ text: OCR and redaction with Qwen 3.5 (Mar 2026)
29
+
30
+ format:
31
+ html:
32
+ theme: cosmo
33
+ css: styles.css
app.py ADDED
The diff for this file is too large to render. See raw diff
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from aws_cdk import App, Environment
4
+ from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
5
+ from cdk_functions import create_basic_config_env, load_context_from_file
6
+ from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
7
+
8
+ # Assuming these are still relevant for you
9
+ from check_resources import CONTEXT_FILE, check_and_set_context
10
+
11
# Entry point for `cdk synth` / `cdk deploy`: regenerates the CDK context file,
# loads it into the app, writes a starter config.env, then defines the stacks.

# Initialize the CDK app
app = App()

# --- ENHANCED CONTEXT GENERATION AND LOADING ---
# 1. Always ensure the old context file is removed before generation
if os.path.exists(CONTEXT_FILE):
    try:
        os.remove(CONTEXT_FILE)
        print(f"Removed stale context file: {CONTEXT_FILE}")
    except OSError as e:
        print(f"Warning: Could not remove old context file {CONTEXT_FILE}: {e}")
        # Proceed anyway, check_and_set_context might handle overwriting

# 2. Always run the pre-check script to generate fresh context
print("Running pre-check script to generate application context...")
try:
    # Keep the try body to the one call that can fail, and chain the original
    # exception so its traceback is preserved for debugging.
    check_and_set_context()
except Exception as e:
    raise RuntimeError(
        f"Failed to generate context via check_and_set_context(): {e}"
    ) from e

# 3. The pre-check must have produced the context file; fail loudly if it did not
if not os.path.exists(CONTEXT_FILE):
    raise RuntimeError(
        f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
    )
print(f"Context generated successfully at {CONTEXT_FILE}.")

load_context_from_file(app, CONTEXT_FILE)

# Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
create_basic_config_env("config")

# Define the environment for the regional stack (where ALB resides)
aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)

# Create the regional stack (ALB, SGs, etc.)
# regional_stack = CdkStack(app,
#                           "RedactionStackSubnets",
#                           env=aws_env_regional,
#                           cross_region_references=True)

# regional_stack_main = CdkStackMain(app,
#                           "RedactionStackMain",
#                           env=aws_env_regional,
#                           private_subnets=regional_stack.params["private_subnets"],
#                           private_route_tables=regional_stack.params["private_route_tables"],
#                           public_subnets=regional_stack.params["public_subnets"],
#                           public_route_tables=regional_stack.params["public_route_tables"],
#                           cross_region_references=True)

regional_stack = CdkStack(
    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
)

# The config flags are compared as strings ("True"/"False"), not booleans
if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
    # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")

    # Create the CloudFront stack, passing the outputs from the regional stack
    cloudfront_stack = CdkStackCloudfront(
        app,
        "RedactionStackCloudfront",
        env=aws_env_us_east_1,
        alb_arn=regional_stack.params["alb_arn_output"],
        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
        alb_dns_name=regional_stack.params["alb_dns_name"],
        cross_region_references=True,
    )


# Synthesize the CloudFormation template
app.synth(validate_on_synthesis=True)
cdk/cdk_config.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ # Set or retrieve configuration variables for CDK redaction deployment
7
+
8
+
9
def convert_string_to_boolean(value: str) -> bool:
    """Convert a boolean-like string to a ``bool``.

    Matching ignores letter case and surrounding whitespace, so values read
    from ``.env`` files such as ``" true"`` or ``"FALSE "`` are accepted.

    Args:
        value: ``"true"``/``"1"`` or ``"false"``/``"0"`` in any letter case.
            An actual ``bool`` is returned unchanged.

    Returns:
        The boolean that ``value`` represents.

    Raises:
        ValueError: If ``value`` is not a recognised boolean representation.
    """
    if isinstance(value, bool):
        return value

    if isinstance(value, str):
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False

    raise ValueError(f"Invalid boolean value: {value}")
19
+
20
+
21
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
    """
    Return the value of an environment variable, creating it from a default when unset.

    Args:
        var_name: Name of the environment variable to read.
        default_value: Value stored in the environment (and returned) when
            ``var_name`` is not already set.
        print_val: When exactly ``True``, print the resolved value.

    Returns:
        The existing value of ``var_name``, or ``default_value`` if it was unset.
    """
    # setdefault only writes default_value when var_name is missing
    resolved = os.environ.setdefault(var_name, default_value)

    if print_val is True:
        print(f"The value of {var_name} is {resolved}")

    return resolved
37
+
38
+
39
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` (with any missing parents) unless the path already exists."""
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # Nothing at this path yet, so create the folder
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
48
+
49
+
50
def add_folder_to_path(folder_path: str):
    """
    Prepend an existing folder to the process ``PATH`` if it is not already listed.

    Only relevant for locally-created executable files based on this app: when
    using pyinstaller it creates an ``_internal`` folder that contains tesseract
    and poppler, and these need to be on the system path for the app to run.

    Args:
        folder_path: Folder to add. Relative paths are resolved to absolute
            paths before being compared with, and added to, ``PATH``.
    """
    # isdir() is False for missing paths too, so one check covers both cases
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # Resolve relative path to absolute path
    absolute_path = os.path.abspath(folder_path)

    # PATH can be unset in minimal environments; treat that as empty, not an error
    current_path = os.environ.get("PATH", "")
    existing_entries = current_path.split(os.pathsep) if current_path else []

    if absolute_path in existing_entries:
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Avoid a trailing separator on an empty PATH: on POSIX an empty entry
    # means "current directory", which should not be added implicitly.
    if current_path:
        os.environ["PATH"] = absolute_path + os.pathsep + current_path
    else:
        os.environ["PATH"] = absolute_path
70
+
71
+
72
+ ###
73
+ # LOAD CONFIG FROM ENV FILE
74
+ ###
75
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
76
+
77
+ ensure_folder_exists(CONFIG_FOLDER)
78
+
79
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
80
+ CDK_CONFIG_PATH = get_or_create_env_var(
81
+ "CDK_CONFIG_PATH", "config/cdk_config.env"
82
+ ) # e.g. config/cdk_config.env
83
+
84
+ if CDK_CONFIG_PATH:
85
+ if os.path.exists(CDK_CONFIG_PATH):
86
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
87
+ load_dotenv(CDK_CONFIG_PATH)
88
+ else:
89
+ print("CDK config file not found at location:", CDK_CONFIG_PATH)
90
+
91
+ ###
92
+ # AWS OPTIONS
93
+ ###
94
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
95
+ AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
96
+
97
+ ###
98
+ # CDK OPTIONS
99
+ ###
100
+ CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
101
+ CONTEXT_FILE = get_or_create_env_var(
102
+ "CONTEXT_FILE", "cdk.context.json"
103
+ ) # Define the CDK output context file name
104
+ CDK_FOLDER = get_or_create_env_var(
105
+ "CDK_FOLDER", ""
106
+ ) # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
107
+ RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
108
+
109
+ ### VPC and connections
110
+ VPC_NAME = get_or_create_env_var("VPC_NAME", "")
111
+ NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
112
+ NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "") # "10.0.0.0/24"
113
+
114
+
115
+ EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
116
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
117
+
118
+ ### SUBNETS / ROUTE TABLES / NAT GATEWAY
119
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
120
+ "PUBLIC_SUBNETS_TO_USE", ""
121
+ ) # e.g. ['PublicSubnet1', 'PublicSubnet2']
122
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
123
+ "PUBLIC_SUBNET_CIDR_BLOCKS", ""
124
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
125
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
126
+ "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
127
+ ) # e.g. ["eu-west-1a", "eu-west-1b"]
128
+
129
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
130
+ "PRIVATE_SUBNETS_TO_USE", ""
131
+ ) # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
132
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
133
+ "PRIVATE_SUBNET_CIDR_BLOCKS", ""
134
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
135
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
136
+ "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
137
+ ) # e.g. ["eu-west-1a", "eu-west-1b"]
138
+
139
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
140
+ "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
141
+ )
142
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
143
+ "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
144
+ )
145
+ NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
146
+
147
+ # IAM roles
148
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
149
+ "AWS_MANAGED_TASK_ROLES_LIST",
150
+ '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs", "AmazonBedrockFullAccess"]',
151
+ )
152
+ POLICY_FILE_LOCATIONS = get_or_create_env_var(
153
+ "POLICY_FILE_LOCATIONS", ""
154
+ ) # e.g. '["config/sts_permissions.json"]'
155
+ POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
156
+
157
+ # GITHUB REPO
158
+ GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
159
+ GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
160
+ GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
161
+
162
+ ### CODEBUILD
163
+ CODEBUILD_ROLE_NAME = get_or_create_env_var(
164
+ "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
165
+ )
166
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var(
167
+ "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
168
+ )
169
+
170
+ ### ECR
171
+ ECR_REPO_NAME = get_or_create_env_var(
172
+ "ECR_REPO_NAME", "doc-redaction"
173
+ ) # Beware - cannot have underscores and must be lower case
174
+ ECR_CDK_REPO_NAME = get_or_create_env_var(
175
+ "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
176
+ )
177
+
178
+ ### S3
179
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
180
+ "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
181
+ ) # S3 bucket names need to be lower case
182
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
183
+ "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
184
+ )
185
+
186
+ ### KMS KEYS FOR S3 AND SECRETS MANAGER
187
+ USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
188
+ CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
189
+ "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
190
+ )
191
+
192
+ ### ECS
193
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
194
+ "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
195
+ )
196
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
197
+ "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
198
+ )
199
+
200
+ CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
201
+ ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
202
+ ECS_TASK_ROLE_NAME = get_or_create_env_var(
203
+ "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
204
+ )
205
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
206
+ "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
207
+ )
208
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
209
+ "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
210
+ )
211
+ ECS_LOG_GROUP_NAME = get_or_create_env_var(
212
+ "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
213
+ )
214
+
215
+ ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
216
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
217
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
218
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
219
+
220
+ ### Cognito
221
+ COGNITO_USER_POOL_NAME = get_or_create_env_var(
222
+ "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
223
+ )
224
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
225
+ "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
226
+ )
227
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
228
+ "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
229
+ )
230
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
231
+ "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
232
+ ) # Should change this to something unique or you'll probably hit an error
233
+
234
+ COGNITO_REFRESH_TOKEN_VALIDITY = int(
235
+ get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
236
+ ) # Minutes
237
+ COGNITO_ID_TOKEN_VALIDITY = int(
238
+ get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
239
+ ) # Minutes
240
+ COGNITO_ACCESS_TOKEN_VALIDITY = int(
241
+ get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
242
+ ) # Minutes
243
+
244
+ # Application load balancer
245
+ ALB_NAME = get_or_create_env_var(
246
+ "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
247
+ ) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
248
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
249
+ "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
250
+ )
251
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var(
252
+ "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
253
+ ) # Max 32 characters
254
+ EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
255
# DNS name of an existing load balancer. It is looked up under its own
# variable name: reading "EXISTING_LOAD_BALANCER_ARN" here (as before) made the
# DNS setting mirror the ARN setting, so it could never be configured on its own.
EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
    "EXISTING_LOAD_BALANCER_DNS", "placeholder_load_balancer_dns.net"
)
258
+
259
+ ## CLOUDFRONT
260
+ USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
261
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
262
+ "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
263
+ )
264
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
265
+ "CLOUDFRONT_GEO_RESTRICTION", ""
266
+ ) # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
267
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
268
+ "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
269
+ )
270
+ CLOUDFRONT_DOMAIN = get_or_create_env_var(
271
+ "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
272
+ )
273
+
274
+
275
+ # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
276
+ ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
277
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
278
+ "SSL_CERTIFICATE_DOMAIN", ""
279
+ ) # e.g. example.com or www.example.com
280
+
281
+ # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
282
# Default redirect host, in priority order: the CloudFront domain when
# CloudFront is enabled, otherwise the certificate domain if one is set,
# otherwise the DNS name of the existing load balancer.
COGNITO_REDIRECTION_URL = get_or_create_env_var(
    "COGNITO_REDIRECTION_URL",
    "https://"
    + (
        CLOUDFRONT_DOMAIN
        if USE_CLOUDFRONT == "True"
        else (SSL_CERTIFICATE_DOMAIN or EXISTING_LOAD_BALANCER_DNS)
    ),
)
294
+
295
+ # Custom headers e.g. if routing traffic through Cloudfront
296
+ CUSTOM_HEADER = get_or_create_env_var(
297
+ "CUSTOM_HEADER", ""
298
+ ) # Retrieving or setting CUSTOM_HEADER
299
+ CUSTOM_HEADER_VALUE = get_or_create_env_var(
300
+ "CUSTOM_HEADER_VALUE", ""
301
+ ) # Retrieving or setting CUSTOM_HEADER_VALUE
302
+
303
+ # Firewall on top of load balancer
304
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
305
+ "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
306
+ )
307
+
308
+ # Firewall on top of CloudFront
309
+ WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
310
+
311
+ ###
312
+ # File I/O options
313
+ ###
314
+
315
+ OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
316
+ INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
317
+
318
+ # Allow for files to be saved in a temporary folder for increased security in some instances
319
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # mkdtemp() creates a directory that stays on disk until it is removed
    # explicitly. TemporaryDirectory() used as a context manager deletes the
    # directory as soon as the `with` block exits, which would leave
    # OUTPUT_FOLDER / INPUT_FOLDER pointing at a path that no longer exists.
    temp_dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {temp_dir}")

    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
328
+
329
+ ###
330
+ # LOGGING OPTIONS
331
+ ###
332
+
333
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
334
+
335
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
336
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
337
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
338
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
339
+ )
340
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
341
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
342
+ )
343
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
344
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
345
+ )
346
+
347
+ ###
348
+ # REDACTION OPTIONS
349
+ ###
350
+
351
+ # Get some environment variables and Launch the Gradio app
352
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
353
+
354
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
355
+
356
+ ###
357
+ # WHOLE DOCUMENT API OPTIONS
358
+ ###
359
+
360
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
361
+ "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
362
+ ) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
cdk/cdk_functions.py ADDED
@@ -0,0 +1,1482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ipaddress
2
+ import json
3
+ import os
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import boto3
7
+ import pandas as pd
8
+ from aws_cdk import App, CfnOutput, CfnTag, Tags
9
+ from aws_cdk import aws_cognito as cognito
10
+ from aws_cdk import aws_ec2 as ec2
11
+ from aws_cdk import aws_elasticloadbalancingv2 as elb
12
+ from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
13
+ from aws_cdk import aws_iam as iam
14
+ from aws_cdk import aws_wafv2 as wafv2
15
+ from botocore.exceptions import ClientError
16
+ from cdk_config import (
17
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
18
+ AWS_REGION,
19
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
20
+ NAT_GATEWAY_EIP_NAME,
21
+ POLICY_FILE_LOCATIONS,
22
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
23
+ PRIVATE_SUBNET_CIDR_BLOCKS,
24
+ PRIVATE_SUBNETS_TO_USE,
25
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
26
+ PUBLIC_SUBNET_CIDR_BLOCKS,
27
+ PUBLIC_SUBNETS_TO_USE,
28
+ S3_LOG_CONFIG_BUCKET_NAME,
29
+ S3_OUTPUT_BUCKET_NAME,
30
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
31
+ )
32
+ from constructs import Construct
33
+ from dotenv import set_key
34
+
35
+
36
+ # --- Function to load context from file ---
37
def load_context_from_file(app: App, file_path: str):
    """Copy every key/value pair of a JSON file into the CDK app's context.

    Args:
        app: The CDK app whose node receives the context values.
        file_path: Path to a JSON file holding a top-level object.

    If the file does not exist, a message is printed and nothing is loaded.
    """
    if not os.path.exists(file_path):
        print(f"Context file not found: {file_path}")
        return

    with open(file_path, "r") as handle:
        entries = json.load(handle)

    for context_key, context_value in entries.items():
        app.node.set_context(context_key, context_value)
    print(f"Loaded context from {file_path}")
46
+
47
+
48
+ # --- Helper to parse environment variables into lists ---
49
+ def _get_env_list(env_var_name: str) -> List[str]:
50
+ """Parses a comma-separated environment variable into a list of strings."""
51
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
52
+ if not value:
53
+ return []
54
+ # Split by comma and filter out any empty strings that might result from extra commas
55
+ return [s.strip() for s in value.split(",") if s.strip()]
56
+
57
+
58
+ # 1. Try to load CIDR/AZs from environment variables
59
# Convert the list-like string settings imported from cdk_config into real
# lists. _get_env_list parses the string it is GIVEN, so each call must pass
# the setting's value; passing the quoted variable name would parse the name
# itself and yield a single bogus entry.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )

if POLICY_FILE_LOCATIONS:
    POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
77
+
78
+
79
def check_for_existing_role(role_name: str):
    """Check whether an IAM role with the given name exists.

    Args:
        role_name: Name of the IAM role to look up.

    Returns:
        ``(True, role_arn, "")`` when the role is found, or
        ``(False, "", "")`` when IAM reports that no such role exists.

    Raises:
        Exception: For any other failure (including failure to create the
            client); the original error is attached as the cause.
    """
    try:
        # Named iam_client so it does not shadow the module-level `iam`
        # (aws_cdk.aws_iam) import inside this function.
        iam_client = boto3.client("iam")
        response = iam_client.get_role(RoleName=role_name)
        role = response["Role"]["Arn"]

        print("Response Role:", role)

        return True, role, ""
    except ClientError as e:
        # Matching on the error code avoids referencing the client object in
        # the except clause, where it may not have been created yet.
        if e.response.get("Error", {}).get("Code") == "NoSuchEntity":
            return False, "", ""
        raise Exception(
            f"Getting information on IAM role failed due to: {e}"
        ) from e
    except Exception as e:
        raise Exception(
            f"Getting information on IAM role failed due to: {e}"
        ) from e
94
+
95
+
96
+ from typing import List
97
+
98
+ # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
99
+ # For example:
100
+ # POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
101
+
102
+
103
def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
    """Attach each statement of a parsed IAM policy document to a role.

    Args:
        role: The CDK role that receives the statements.
        policy_document: Parsed policy document; must hold a list under the
            "Statement" key, otherwise a warning is printed and nothing is
            added.

    A statement that cannot be converted or attached is reported with a
    warning and skipped; the remaining statements are still processed.
    """
    has_statement_list = "Statement" in policy_document and isinstance(
        policy_document["Statement"], list
    )
    if not has_statement_list:
        print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
        return

    for raw_statement in policy_document["Statement"]:
        try:
            # Build the CDK statement from its JSON form and attach it.
            role.add_to_policy(iam.PolicyStatement.from_json(raw_statement))
            print(f" - Added statement: {raw_statement.get('Sid', 'No Sid')}")
        except Exception as e:
            print(
                f"Warning: Could not process policy statement: {raw_statement}. Error: {e}"
            )
130
+
131
+
132
def add_custom_policies(
    scope: Construct,  # Unused here; kept so callers can pass the construct scope
    role: iam.IRole,
    policy_file_locations: Optional[List[str]] = None,
    custom_policy_text: Optional[str] = None,
) -> iam.IRole:
    """Attach policy statements from JSON files and/or a JSON string to a role.

    Args:
        scope: Construct scope (not used by this function).
        role: The CDK role that receives the statements.
        policy_file_locations: Paths of JSON policy documents to load.
        custom_policy_text: A JSON string holding a policy document.

    Returns:
        The same role object that was passed in.

    Every failure (missing file, invalid JSON, anything else) is printed and
    skipped; this function does not raise for a bad policy source.
    """
    policy_paths = [] if policy_file_locations is None else policy_file_locations

    current_source = "unknown source"  # Named in the outer error message

    try:
        if policy_paths:
            print(f"Attempting to add policies from files to role {role.node.id}...")
            for path in policy_paths:
                current_source = f"file: {path}"
                try:
                    with open(path, "r") as handle:
                        parsed_policy = json.load(handle)
                    print(f"Processing policy from {current_source}...")
                    add_statement_to_policy(role, parsed_policy)
                except FileNotFoundError:
                    print(f"Warning: Policy file not found at {path}. Skipping.")
                except json.JSONDecodeError as e:
                    print(
                        f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
                    )
                except Exception as e:
                    print(
                        f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
                    )

        if custom_policy_text:
            current_source = "custom policy text string"
            print(
                f"Attempting to add policy from custom text to role {role.node.id}..."
            )
            try:
                # The text is a JSON string; parse it before use.
                parsed_policy = json.loads(custom_policy_text)
                print(f"Processing policy from {current_source}...")
                add_statement_to_policy(role, parsed_policy)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
            except Exception as e:
                print(
                    f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
                )

        print(f"Finished processing custom policies for role {role.node.id}.")

    except Exception as e:
        print(
            f"An unhandled error occurred during policy addition for {current_source}: {e}"
        )

    return role
202
+
203
+
204
+ # Import the S3 Bucket class if you intend to return a CDK object later
205
+ # from aws_cdk import aws_s3 as s3
206
+
207
+
208
def check_s3_bucket_exists(
    bucket_name: str,
):
    """Report whether an S3 bucket exists and is accessible, via head_bucket.

    Args:
        bucket_name: The name of the S3 bucket to check.

    Returns:
        A 2-tuple ``(exists, name)``:
        - ``(True, bucket_name)`` when head_bucket succeeds.
        - ``(False, None)`` when the error code is "404".
        - ``(False, bucket_name)`` when the error code is "403". This is
          deliberately reported as not existing: per the original author's
          testing, 403 was also returned for buckets that do not exist.

    Raises:
        ClientError: For any other AWS error code (re-raised unchanged).
        Exception: Any non-ClientError failure (re-raised unchanged).
    """
    s3_client = boto3.client("s3")
    try:
        s3_client.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == "404":
            print(f"Bucket '{bucket_name}' does not exist.")
            return False, None
        if error_code == "403":
            # Ambiguous answer: treated as "does not exist" (see docstring).
            print(
                f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
            )
            return False, bucket_name
        # Anything else is unexpected: log it and let the caller see it.
        print(
            f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}"
        )
        raise
    else:
        print(f"Bucket '{bucket_name}' exists and is accessible.")
        return True, bucket_name
263
+
264
+
265
+ # Example usage in your check_resources.py:
266
+ # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
267
+ # context_data[f"exists:{log_bucket_name}"] = exists
268
+ # # You don't necessarily need to store the name in context if using from_bucket_name
269
+
270
+
271
+ # Delete an S3 bucket
272
def delete_s3_bucket(bucket_name: str):
    """Delete every object version and delete marker, then the bucket itself.

    Args:
        bucket_name: Name of the S3 bucket to empty and delete.

    Returns:
        ``{"Status": "SUCCESS"}`` on success, otherwise
        ``{"Status": "FAILED", "Reason": <error text>}``. Errors raised while
        listing or deleting are reported this way, not propagated.
    """
    s3 = boto3.client("s3")

    try:
        # list_object_versions returns results one page at a time. Walk every
        # page so no versions are left behind, which would make delete_bucket
        # fail on a non-empty bucket.
        paginator = s3.get_paginator("list_object_versions")
        for page in paginator.paginate(Bucket=bucket_name):
            entries = page.get("Versions", []) + page.get("DeleteMarkers", [])
            for entry in entries:
                s3.delete_object(
                    Bucket=bucket_name, Key=entry["Key"], VersionId=entry["VersionId"]
                )

        # The bucket is empty now and can be removed.
        s3.delete_bucket(Bucket=bucket_name)
        return {"Status": "SUCCESS"}
    except Exception as e:
        return {"Status": "FAILED", "Reason": str(e)}
289
+
290
+
291
+ # Function to get subnet ID from subnet name
292
def get_subnet_id(vpc: Any, ec2_client: Any, subnet_name: str) -> Optional[str]:
    """Return the ID of the subnet in a VPC whose 'Name' tag matches.

    Args:
        vpc: Object exposing a ``vpc_id`` attribute (used as the filter value).
        ec2_client: Client object exposing ``describe_subnets(Filters=...)``.
        subnet_name: Value of the 'Name' tag to look for.

    Returns:
        The "SubnetId" of the first matching subnet, or None if none matches.
    """
    response = ec2_client.describe_subnets(
        Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
    )

    for subnet in response["Subnets"]:
        # A subnet record may have no "Tags" key at all; treat a missing or
        # empty value as "no tags" instead of raising KeyError.
        tags = subnet.get("Tags") or []
        if any(tag["Key"] == "Name" and tag["Value"] == subnet_name for tag in tags):
            return subnet["SubnetId"]

    return None
305
+
306
+
307
def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
    """Check whether an ECR repository with the given name exists.

    Args:
        repo_name: The name of the ECR repository to check.

    Returns:
        ``(True, repository)`` where ``repository`` is the first entry of the
        describe_repositories response, or ``(False, {})`` when the
        repository is not found, the response lists no repositories, or an
        unexpected non-ClientError occurs (which is printed).

    Raises:
        ClientError: For AWS errors other than RepositoryNotFoundException.
    """
    ecr_client = boto3.client("ecr")
    try:
        print("ecr repo_name to check:", repo_name)
        response = ecr_client.describe_repositories(repositoryNames=[repo_name])
        repositories = response.get("repositories", [])
        # Handle an empty result explicitly instead of relying on an
        # IndexError being swallowed by the broad handler below.
        if not repositories:
            return False, {}
        return True, repositories[0]
    except ClientError as e:
        # This specific code means the repository does not exist.
        if e.response["Error"]["Code"] == "RepositoryNotFoundException":
            return False, {}
        # Other AWS errors (e.g. permissions) are the caller's to handle.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
334
+
335
+
336
def check_codebuild_project_exists(
    project_name: str,
):
    """Check whether a CodeBuild project with the given name exists.

    Args:
        project_name: The name of the CodeBuild project to check.

    Returns:
        ``(True, arn)`` with the ARN of the first returned project when it is
        found, otherwise ``(False, None)``.

    Raises:
        ClientError: Any AWS client error (logged, then re-raised).
        Exception: Any other error (logged, then re-raised).
    """
    codebuild_client = boto3.client("codebuild")
    try:
        # batch_get_projects takes a list of names; ask for just this one.
        lookup = codebuild_client.batch_get_projects(names=[project_name])

        found = lookup["projects"]
        if found:
            print(f"CodeBuild project '{project_name}' found.")
            return True, found[0]["arn"]

        # Not found: distinguish an explicit "not found" answer from a
        # response that simply does not mention the project.
        missing = lookup["projectsNotFound"]
        if missing and project_name in missing:
            print(f"CodeBuild project '{project_name}' not found.")
        else:
            print(
                f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
            )
        return False, None

    except ClientError as e:
        print(
            f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        raise
    except Exception as e:
        print(
            f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}"
        )
        raise
398
+
399
+
400
def get_vpc_id_by_name(
    vpc_name: str,
) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
    """Find a VPC by its 'Name' tag and list the NAT Gateways inside it.

    Args:
        vpc_name: Value of the VPC's 'Name' tag.

    Returns:
        ``(vpc_id, nat_gateways)`` for the first matching VPC, where
        ``nat_gateways`` is the "NatGateways" list from describe_nat_gateways
        (empty if that call fails; a warning is printed). Returns a bare
        ``None`` -- not a tuple -- when no VPC matches, so callers must check
        for None before unpacking.

    Raises:
        Exception: Any error from the VPC lookup itself (logged, re-raised).
    """
    ec2_client = boto3.client("ec2")
    try:
        response = ec2_client.describe_vpcs(
            Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
        )
        if not (response and response["Vpcs"]):
            print(f"VPC '{vpc_name}' not found.")
            return None

        vpc_id = response["Vpcs"][0]["VpcId"]
        print(f"VPC '{vpc_name}' found with ID: {vpc_id}")

        # Look for NAT Gateways in this VPC, reusing the same client. A
        # failure here is only a warning: the VPC ID is still returned.
        nat_gateways = []
        try:
            nat_response = ec2_client.describe_nat_gateways(
                Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
            )
            nat_gateways = nat_response.get("NatGateways", [])
        except Exception as e:
            print(
                f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}"
            )

        return vpc_id, nat_gateways
    except Exception as e:
        print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
        raise
442
+
443
+
444
+ # --- Helper to fetch all existing subnets in a VPC once ---
445
def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
    """Fetch the subnets of a VPC and index them for later lookups.

    Args:
        vpc_id: ID of the VPC whose subnets are described.

    Returns:
        A dict with three keys:
        - "by_name": 'Name' tag -> {"id", "cidr", "name"} (tagged subnets only)
        - "by_id": subnet ID -> the same info dict (every subnet)
        - "cidr_networks": parsed network objects for each valid CIDR block

    Raises:
        Exception: Any error from describing the subnets (logged, re-raised).
    """
    ec2_client = boto3.client("ec2")
    by_name = {}
    by_id = {}
    cidr_networks = []

    try:
        response = ec2_client.describe_subnets(
            Filters=[{"Name": "vpc-id", "Values": [vpc_id]}]
        )
        for subnet in response.get("Subnets", []):
            subnet_id = subnet["SubnetId"]
            cidr_block = subnet.get("CidrBlock")

            # First 'Name' tag wins; subnets without one stay unnamed.
            name_tag = None
            for tag in subnet.get("Tags", []):
                if tag["Key"] == "Name":
                    name_tag = tag["Value"]
                    break

            record = {"id": subnet_id, "cidr": cidr_block, "name": name_tag}
            if name_tag:
                by_name[name_tag] = record
            by_id[subnet_id] = record

            if not cidr_block:
                continue
            try:
                cidr_networks.append(ipaddress.ip_network(cidr_block, strict=False))
            except ValueError:
                print(
                    f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
                )

        print(
            f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
        )
    except Exception as e:
        print(
            f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation."
        )
        raise

    return {"by_name": by_name, "by_id": by_id, "cidr_networks": cidr_networks}
496
+
497
+
498
+ # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
499
def validate_subnet_creation_parameters(
    vpc_id: str,
    proposed_subnets_data: List[Dict[str, str]],
    existing_aws_subnets_data: Dict[str, Any],
) -> None:
    """Validate proposed subnets against each other and against existing ones.

    Args:
        vpc_id: ID of the VPC; used only in log and error messages.
        proposed_subnets_data: One dict per proposed subnet with the keys
            'name', 'cidr' and 'az', e.g.
            ``{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}``.
        existing_aws_subnets_data: Pre-fetched subnet data holding a
            "by_name" mapping and a "cidr_networks" list.

    Raises:
        ValueError: If a proposed subnet is missing a field, repeats a name
            used earlier in the batch, has an unparsable CIDR, or its CIDR
            overlaps another proposed CIDR or an existing subnet CIDR.

    A proposed name that matches an existing subnet name is only printed;
    it does not raise.
    """
    if not proposed_subnets_data:
        print("No proposed subnet data provided for validation. Skipping.")
        return

    print(
        f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
    )

    print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))

    known_names = set(existing_aws_subnets_data["by_name"].keys())
    known_networks = existing_aws_subnets_data["cidr_networks"]

    # Names and networks accepted so far from the proposed batch.
    batch_names: set[str] = set()
    batch_networks: List[ipaddress.IPv4Network] = []

    for index, candidate in enumerate(proposed_subnets_data):
        subnet_name = candidate.get("name")
        cidr_text = candidate.get("cidr")
        zone = candidate.get("az")

        if not (subnet_name and cidr_text and zone):
            raise ValueError(
                f"Proposed subnet at index {index} is incomplete. Requires 'name', 'cidr', and 'az'."
            )

        # 1. The name must be unique within the batch.
        if subnet_name in batch_names:
            raise ValueError(
                f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
            )
        batch_names.add(subnet_name)

        # 2. A clash with an existing subnet name is reported, not fatal.
        if subnet_name in known_names:
            print(
                f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
            )

        try:
            candidate_net = ipaddress.ip_network(cidr_text, strict=False)
        except ValueError as e:
            raise ValueError(
                f"Invalid CIDR format '{cidr_text}' for proposed subnet '{subnet_name}': {e}"
            )

        # 3. The CIDR must not overlap one accepted earlier in the batch.
        batch_clash = next(
            (net for net in batch_networks if candidate_net.overlaps(net)), None
        )
        if batch_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_text}' for subnet '{subnet_name}' "
                f"overlaps with another proposed CIDR '{str(batch_clash)}' "
                f"within the same batch."
            )

        # 4. The CIDR must not overlap a subnet that already exists.
        aws_clash = next(
            (net for net in known_networks if candidate_net.overlaps(net)), None
        )
        if aws_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_text}' for subnet '{subnet_name}' "
                f"overlaps with an existing AWS subnet CIDR '{str(aws_clash)}' "
                f"in VPC '{vpc_id}'."
            )

        batch_networks.append(candidate_net)
        print(
            f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_text}'."
        )

    print(
        f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
    )
599
+
600
+
601
+ # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
602
def check_subnet_exists_by_name(
    subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """Look up a subnet by 'Name' tag in pre-fetched subnet data.

    Args:
        subnet_name: The 'Name' tag value of the subnet to find.
        existing_aws_subnets_data: Pre-fetched subnet data holding a
            "by_name" mapping of name -> info dict with an 'id' entry.

    Returns:
        ``(True, subnet_id)`` when the name is present, else ``(False, None)``.
    """
    match = existing_aws_subnets_data["by_name"].get(subnet_name)
    if not match:
        print(f"Subnet '{subnet_name}' not found.")
        return False, None

    print(f"Subnet '{subnet_name}' found with ID: {match['id']}")
    return True, match["id"]
625
+
626
+
627
def create_nat_gateway(
    scope: Construct,
    public_subnet_for_nat: ec2.ISubnet,
    nat_gateway_name: str,
    nat_gateway_id_context_key: str,
) -> str:
    """Define one NAT Gateway, with its own Elastic IP, in a public subnet.

    No context lookup is done here; the calling stack decides whether a
    gateway needs to be created.

    Args:
        scope: Construct the resources are defined in; its ``stack_name`` is
            used to build the output's export name.
        public_subnet_for_nat: Subnet whose ``subnet_id`` hosts the gateway.
        nat_gateway_name: Value of the gateway's 'Name' tag; with hyphens
            removed it also forms the gateway's construct ID.
        nat_gateway_id_context_key: Context key named in the output
            description, telling the user where to record the ID.

    Returns:
        The gateway's ``ref`` value (a synthesis-time reference to its ID).
    """
    print(
        f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'."
    )

    # Elastic IP that the gateway will use.
    elastic_ip = ec2.CfnEIP(
        scope,
        NAT_GATEWAY_EIP_NAME,
        tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
    )

    # The gateway itself, tied to the subnet and the Elastic IP allocation.
    gateway = ec2.CfnNatGateway(
        scope,
        nat_gateway_name.replace("-", "") + "NatGateway",
        subnet_id=public_subnet_for_nat.subnet_id,
        allocation_id=elastic_ip.attr_allocation_id,
        tags=[CfnTag(key="Name", value=nat_gateway_name)],
    )
    # Explicit dependency on the EIP; the subnet link comes via subnet_id.
    gateway.add_dependency(elastic_ip)

    # Export the ID as a stack output so it can be copied into
    # cdk.context.json after deployment.
    CfnOutput(
        scope,
        "SingleNatGatewayIdOutput",
        value=gateway.ref,
        description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
        export_name=f"{scope.stack_name}-NatGatewayId",
    )

    print(
        f"CDK: Defined new NAT Gateway '{gateway.ref}'. Its physical ID will be available in the stack outputs after deployment."
    )
    return gateway.ref
676
+
677
+
678
def create_subnets(
    scope: Construct,
    vpc: ec2.IVpc,
    prefix: str,
    subnet_names: List[str],
    cidr_blocks: List[str],
    availability_zones: List[str],
    is_public: bool,
    internet_gateway_id: Optional[str] = None,
    single_nat_gateway_id: Optional[str] = None,
) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
    """
    Define a batch of public or private subnets with a default route each.

    Each subnet is created with the L2 ``ec2.Subnet`` construct, tagged with
    'Name' and 'Type', and given a default route: via the Internet Gateway for
    public subnets, via the single NAT Gateway for private ones. Route creation
    is best-effort: a failure is reported on stdout and the subnet is still
    returned.

    Args:
        scope: Construct scope in which the subnets are defined.
        vpc: VPC whose ``vpc_id`` the subnets are attached to.
        prefix: Prefix used to build each subnet's logical ID.
        subnet_names: 'Name' tag value for each subnet.
        cidr_blocks: CIDR block for each subnet (same order as subnet_names).
        availability_zones: Availability Zone for each subnet (same order).
        is_public: True to create public subnets, False for private ones.
        internet_gateway_id: Required when is_public is True.
        single_nat_gateway_id: Required when is_public is False.

    Returns:
        A tuple (subnets, route_tables), index-aligned with subnet_names.
        NOTE(review): the objects appended are the L2 ``ec2.Subnet`` and its
        ``route_table`` attribute, although the annotation names the L1 types.

    Raises:
        ValueError: If the three lists are empty or differ in length, or if the
            gateway ID required for the chosen subnet type is missing.
    """
    if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
        raise ValueError(
            "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
        )
    if is_public and not internet_gateway_id:
        raise ValueError("internet_gateway_id must be provided for public subnets.")
    if not is_public and not single_nat_gateway_id:
        raise ValueError(
            "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
        )

    created_subnets: List[ec2.CfnSubnet] = []
    created_route_tables: List[ec2.CfnRouteTable] = []

    subnet_type_tag = "public" if is_public else "private"

    # Everything that differs between the public and private default route.
    if is_public:
        route_logical_id = "DefaultInternetRoute"
        router_id = internet_gateway_id
        router_type = ec2.RouterType.GATEWAY
        failure_label = "IGW"
        success_label = "IGW"
    else:
        route_logical_id = "DefaultNatRoute"
        router_id = single_nat_gateway_id
        router_type = ec2.RouterType.NAT_GATEWAY
        failure_label = "NAT gateway"
        success_label = "NAT GW"

    for index, (subnet_name, cidr_block, availability_zone) in enumerate(
        zip(subnet_names, cidr_blocks, availability_zones), start=1
    ):
        logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{index}"

        # The L2 Subnet construct also creates the subnet's own route table.
        subnet = ec2.Subnet(
            scope,
            logical_id,
            vpc_id=vpc.vpc_id,
            cidr_block=cidr_block,
            availability_zone=availability_zone,
            map_public_ip_on_launch=is_public,
        )
        Tags.of(subnet).add("Name", subnet_name)
        Tags.of(subnet).add("Type", subnet_type_tag)

        try:
            # destination_cidr_block defaults to 0.0.0.0/0 for add_route.
            subnet.add_route(
                route_logical_id,
                router_id=router_id,
                router_type=router_type,
            )
        except Exception as e:
            # Best-effort: report the failure against the correct subnet type
            # and carry on so the subnet itself is still returned.
            print(
                f"Could not create {failure_label} route for {subnet_type_tag} subnet due to:",
                e,
            )
        else:
            # Only claim the route was added when add_route actually succeeded.
            print(
                f"CDK: Defined {subnet_type_tag} L2 subnet '{subnet_name}' and added {success_label} route."
            )

        created_subnets.append(subnet)
        created_route_tables.append(subnet.route_table)

    return created_subnets, created_route_tables
758
+
759
+
760
def ingress_rule_exists(security_group: str, peer: str, port: str):
    """
    Report whether an entry matching the peer (and port, if given) exists.

    Iterates ``security_group.connections.security_groups`` and compares each
    entry's ``peer`` (and, when ``port`` is truthy, its ``connection``) with
    the supplied values.

    NOTE(review): despite the ``str`` annotation, ``security_group`` must be an
    object exposing ``connections.security_groups`` - confirm against callers.

    Args:
        security_group: Object whose connections are inspected.
        peer: Peer value to match.
        port: Connection value to match; a falsy value matches on peer alone.

    Returns:
        True if a matching entry is found, False otherwise.
    """
    return any(
        rule.peer == peer and (not port or rule.connection == port)
        for rule in security_group.connections.security_groups
    )
769
+
770
+
771
def check_for_existing_user_pool(user_pool_name: str):
    """
    Find a Cognito User Pool by name.

    All pages of ListUserPools are walked (60 pools per page, the API maximum),
    so the pool is found even in accounts with more than 60 pools.

    Args:
        user_pool_name: The name of the User Pool to look for.

    Returns:
        A tuple:
        - (True, pool_id, pool_description) if a pool with that name exists.
        - (False, "", "") otherwise.
    """
    cognito_client = boto3.client("cognito-idp")

    request_kwargs = {"MaxResults": 60}  # MaxResults up to 60

    while True:
        page = cognito_client.list_user_pools(**request_kwargs)

        for pool in page.get("UserPools", []):
            if pool.get("Name") == user_pool_name:
                existing_user_pool_id = pool["Id"]
                print(
                    f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
                )
                return True, existing_user_pool_id, pool

        # NextToken is only present while further pages remain.
        next_token = page.get("NextToken")
        if not next_token:
            return False, "", ""
        request_kwargs["NextToken"] = next_token
794
+
795
+
796
def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
    """
    Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.

    Every page of ListUserPoolClients is inspected. The first request is sent
    without a NextToken; only tokens returned by the service are passed back.

    Args:
        user_pool_id: The ID of the Cognito User Pool.
        user_pool_client_name: The name of the User Pool Client to check for.

    Returns:
        A tuple:
        - True, client_id, client_details if the client exists.
        - False, "", {} otherwise (including when the lookup itself fails).
    """
    cognito_client = boto3.client("cognito-idp")

    # NextToken is added only once the service has handed one back; sending a
    # placeholder token on the first call makes the request invalid.
    request_kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}

    while True:
        try:
            response = cognito_client.list_user_pool_clients(**request_kwargs)
        except cognito_client.exceptions.ResourceNotFoundException:
            print(f"Error: User pool with ID '{user_pool_id}' not found.")
            return False, "", {}
        except cognito_client.exceptions.InvalidParameterException:
            print(f"Error: No app clients for '{user_pool_id}' found.")
            return False, "", {}
        except Exception as e:
            # Without a response there is nothing to inspect, so report
            # "not found" instead of falling through to an unbound variable.
            print("Could not check User Pool clients due to:", e)
            return False, "", {}

        for client in response.get("UserPoolClients", []):
            if client.get("ClientName") == user_pool_client_name:
                print(
                    f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
                )
                return True, client["ClientId"], client

        next_token = response.get("NextToken")
        if not next_token:
            break
        request_kwargs["NextToken"] = next_token

    return False, "", {}
840
+
841
+
842
def check_for_secret(secret_name: str, secret_value: dict = ""):
    """
    Check whether a Secrets Manager secret exists and, if so, fetch its value.

    This function only reads from Secrets Manager; it never creates or
    modifies a secret.

    Args:
        secret_name: The name of the Secrets Manager secret.
        secret_value: Unused. Kept so existing callers that pass it keep working.

    Returns:
        A tuple:
        - (True, response) if the secret exists, where response is the
          dictionary returned by get_secret_value.
        - (False, {}) if the secret does not exist or the lookup failed.
    """
    secretsmanager_client = boto3.client("secretsmanager")

    try:
        # A missing secret surfaces as ResourceNotFoundException.
        response = secretsmanager_client.get_secret_value(SecretId=secret_name)
    except secretsmanager_client.exceptions.ResourceNotFoundException:
        print("Secret not found")
        return False, {}
    except Exception as e:
        # Any other failure (permissions, networking, ...) is reported as "not found".
        print(f"Error checking for secret: {e}")
        return False, {}

    print("Secret already exists.")
    return True, response
868
+
869
+
870
def check_alb_exists(
    load_balancer_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Determine whether a load balancer with the given name exists.

    Args:
        load_balancer_name: The name of the ALB to look up.
        region_name: AWS region to query. When falsy, the client is created
            without an explicit region.

    Returns:
        A tuple:
        - (True, load_balancer) where load_balancer is the first entry of the
          LoadBalancers list from the describe_load_balancers response.
        - (False, {}) if no such load balancer exists or an unexpected
          (non-ClientError) exception occurred.

    Raises:
        ClientError: For any AWS error other than 'LoadBalancerNotFound'.
    """
    # Pass region_name only when the caller supplied one.
    client_kwargs = {"region_name": region_name} if region_name else {}
    elbv2_client = boto3.client("elbv2", **client_kwargs)

    try:
        response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
        if not response["LoadBalancers"]:
            return False, {}
        return True, response["LoadBalancers"][0]
    except ClientError as e:
        # "Not found" is an expected outcome; anything else is a real error.
        if e.response["Error"]["Code"] != "LoadBalancerNotFound":
            raise
        return False, {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
911
+
912
+
913
def check_fargate_task_definition_exists(
    task_definition_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Fargate task definition with the given name exists.

    Args:
        task_definition_name: The name or ARN of the task definition to check.
        region_name: The AWS region to check in. If falsy, the client is
            created without an explicit region.

    Returns:
        A tuple:
        - (True, task_definition) where task_definition is the
          'taskDefinition' entry of the describe_task_definition response.
        - (False, {}) if the task definition does not exist or an unexpected
          (non-ClientError) exception occurred.

    Raises:
        ClientError: For AWS errors that do not indicate a missing task definition.
    """
    if region_name:
        ecs_client = boto3.client("ecs", region_name=region_name)
    else:
        ecs_client = boto3.client("ecs")

    try:
        response = ecs_client.describe_task_definition(
            taskDefinition=task_definition_name
        )
        # A successful call means the task definition exists.
        return True, response["taskDefinition"]
    except ClientError as e:
        # botocore puts both the code and the message under the "Error" key;
        # reading e.response["Message"] would itself raise KeyError.
        error = e.response.get("Error", {})
        message = error.get("Message", "") or ""
        is_missing = error.get("Code") == "ClientException" and (
            ("Task definition" in message and "does not exist" in message)
            # NOTE(review): ECS is believed to report an unknown family as
            # "Unable to describe task definition." - confirm against the API.
            or "Unable to describe task definition" in message
        )
        if is_missing:
            return False, {}
        # Re-raise other exceptions.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
956
+
957
+
958
def check_ecs_service_exists(
    cluster_name: str, service_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Determine whether an ECS service exists in the given cluster.

    Args:
        cluster_name: The name or ARN of the ECS cluster.
        service_name: The name of the ECS service to look up.
        region_name: AWS region to query. When falsy, the client is created
            without an explicit region.

    Returns:
        A tuple:
        - (True, service) where service is the first entry of the 'services'
          list from the describe_services response.
        - (False, {}) if that list is empty, the cluster or service is
          reported as not found, or an unexpected exception occurred.

    Raises:
        ClientError: For AWS errors other than ClusterNotFoundException and
            ServiceNotFoundException.
    """
    # Pass region_name only when the caller supplied one.
    client_kwargs = {"region_name": region_name} if region_name else {}
    ecs_client = boto3.client("ecs", **client_kwargs)

    not_found_codes = ("ClusterNotFoundException", "ServiceNotFoundException")

    try:
        response = ecs_client.describe_services(
            cluster=cluster_name, services=[service_name]
        )
        if not response["services"]:
            return False, {}
        return True, response["services"][0]
    except ClientError as e:
        # A missing cluster or service is an expected outcome.
        if e.response["Error"]["Code"] in not_found_codes:
            return False, {}
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1003
+
1004
+
1005
def check_cloudfront_distribution_exists(
    distribution_name: str, region_name: str = None
) -> tuple[bool, dict | None]:
    """
    Checks if a CloudFront distribution with the given alias exists.

    CloudFront cannot filter by name, so every page of ListDistributions is
    scanned and each distribution's alternate domain names (the 'Aliases'
    entry of the summary) are compared with ``distribution_name``.

    Args:
        distribution_name: The alias (alternate domain name) to look for.
        region_name: The AWS region used to create the client. If falsy, the
            client is created without an explicit region.

    Returns:
        A tuple:
        - (True, distribution) where distribution is the matching summary
          dictionary from the ListDistributions response.
        - (False, None) if no distribution carries that alias or an
          unexpected (non-ClientError) exception occurred.

    Raises:
        ClientError: For AWS errors other than 'NoSuchDistribution'.
    """
    if region_name:
        cf_client = boto3.client("cloudfront", region_name=region_name)
    else:
        cf_client = boto3.client("cloudfront")

    try:
        request_kwargs = {}
        while True:
            response = cf_client.list_distributions(**request_kwargs)
            distribution_list = response.get("DistributionList", {})

            for distribution in distribution_list.get("Items", []) or []:
                # 'Items' is absent when the distribution has no aliases.
                aliases = distribution.get("Aliases", {}).get("Items") or []
                if distribution_name in aliases:
                    return True, distribution

            # Follow the pagination marker until the listing is exhausted.
            next_marker = distribution_list.get("NextMarker")
            if not distribution_list.get("IsTruncated") or not next_marker:
                return False, None
            request_kwargs["Marker"] = next_marker
    except ClientError as e:
        # If the error indicates the Distribution doesn't exist, return False
        if e.response["Error"]["Code"] == "NoSuchDistribution":
            return False, None
        # Re-raise other exceptions
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, None
1052
+
1053
+
1054
def create_web_acl_with_common_rules(
    scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
):
    """
    Define a WAFv2 Web ACL made of AWS managed rule groups plus a rate limit.

    The managed rule groups get priorities 1..N in list order and the
    rate-limit rule gets the next priority. An output named "WebACLArn" is
    also defined in ``scope``.

    Args:
        scope: Construct scope (typically the stack) for the Web ACL.
        web_acl_name: Name given to the Web ACL.
        waf_scope: WAF scope passed to the Web ACL (default "CLOUDFRONT").

    Returns:
        The created ``wafv2.CfnWebACL``.
    """

    def _visibility(metric_name):
        # Every rule, and the ACL itself, enables metrics and request sampling.
        return wafv2.CfnWebACL.VisibilityConfigProperty(
            cloud_watch_metrics_enabled=True,
            metric_name=metric_name,
            sampled_requests_enabled=True,
        )

    aws_ruleset_names = [
        "AWSManagedRulesCommonRuleSet",
        "AWSManagedRulesKnownBadInputsRuleSet",
        "AWSManagedRulesAmazonIpReputationList",
    ]

    rules = []
    for priority, ruleset_name in enumerate(aws_ruleset_names, start=1):
        # Only the common rule set carries a per-rule override: the body size
        # restriction is switched to "allow".
        if ruleset_name == "AWSManagedRulesCommonRuleSet":
            action_overrides = [
                wafv2.CfnWebACL.RuleActionOverrideProperty(
                    name="SizeRestrictions_BODY",
                    action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
                )
            ]
        else:
            action_overrides = None

        group_statement = wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
            vendor_name="AWS",
            name=ruleset_name,
            rule_action_overrides=action_overrides,
        )

        rules.append(
            wafv2.CfnWebACL.RuleProperty(
                name=ruleset_name,
                priority=priority,
                statement=wafv2.CfnWebACL.StatementProperty(
                    managed_rule_group_statement=group_statement
                ),
                visibility_config=_visibility(ruleset_name),
                # Managed rule groups need an override_action; 'none' keeps
                # the group's own default actions.
                override_action=wafv2.CfnWebACL.OverrideActionProperty(none={}),
            )
        )

    # Rate limit rule: blocks once an IP exceeds the limit; next free priority.
    rate_limit_statement = wafv2.CfnWebACL.RateBasedStatementProperty(
        limit=1000, aggregate_key_type="IP"
    )
    rules.append(
        wafv2.CfnWebACL.RuleProperty(
            name="RateLimitRule",
            priority=len(aws_ruleset_names) + 1,
            statement=wafv2.CfnWebACL.StatementProperty(
                rate_based_statement=rate_limit_statement
            ),
            visibility_config=_visibility("RateLimitRule"),
            action=wafv2.CfnWebACL.RuleActionProperty(block={}),
        )
    )

    web_acl = wafv2.CfnWebACL(
        scope,
        "WebACL",
        name=web_acl_name,
        default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
        scope=waf_scope,
        visibility_config=_visibility("webACL"),
        rules=rules,
    )

    CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)

    return web_acl
1152
+
1153
+
1154
def check_web_acl_exists(
    web_acl_name: str, scope: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Web ACL with the given name and scope exists.

    All pages of ListWebACLs are scanned for the name; on a match the full
    Web ACL is fetched with GetWebACL, which requires the ACL's Id from the
    listing.

    Args:
        web_acl_name: The name of the Web ACL to check.
        scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
        region_name: The AWS region to check in. Required for REGIONAL scope.
            For CLOUDFRONT the region is always forced to 'us-east-1'.

    Returns:
        A tuple:
        - (True, web_acl) where web_acl is the 'WebACL' entry of the
          GetWebACL response.
        - (False, {}) if no Web ACL with that name exists or an unexpected
          (non-ClientError) exception occurred.

    Raises:
        ValueError: If scope is invalid, or REGIONAL is used without a region.
        ClientError: For AWS errors other than 'ResourceNotFoundException'.
    """
    if scope not in ["CLOUDFRONT", "REGIONAL"]:
        raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")

    if scope == "REGIONAL" and not region_name:
        raise ValueError("Region name is required for REGIONAL scope")

    if scope == "CLOUDFRONT":
        region_name = "us-east-1"  # CloudFront scope requires us-east-1

    if region_name:
        waf_client = boto3.client("wafv2", region_name=region_name)
    else:
        waf_client = boto3.client("wafv2")

    try:
        request_kwargs = {"Scope": scope}
        while True:
            response = waf_client.list_web_acls(**request_kwargs)
            summaries = response.get("WebACLs", [])

            for summary in summaries:
                if summary["Name"] == web_acl_name:
                    # The WAFv2 client has no describe_web_acl operation; the
                    # full object comes from get_web_acl, which needs the Id.
                    detail = waf_client.get_web_acl(
                        Name=web_acl_name, Scope=scope, Id=summary["Id"]
                    )
                    return True, detail["WebACL"]

            # Stop when there is no marker, or when a page comes back empty
            # (guards against looping on a marker that yields nothing).
            next_marker = response.get("NextMarker")
            if not next_marker or not summaries:
                return False, {}
            request_kwargs["NextMarker"] = next_marker
    except ClientError as e:
        # Check for the error code indicating the web ACL doesn't exist.
        if e.response["Error"]["Code"] == "ResourceNotFoundException":
            return False, {}
        # Re-raise other exceptions.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1209
+
1210
+
1211
def add_alb_https_listener_with_cert(
    scope: Construct,
    logical_id: str,  # A unique ID for this listener construct
    alb: elb.ApplicationLoadBalancer,
    acm_certificate_arn: Optional[
        str
    ],  # Optional: If None, no HTTPS listener will be created
    default_target_group: elb.ITargetGroup,  # Mandatory: The target group to forward traffic to
    listener_port_https: int = 443,
    listener_open_to_internet: bool = False,  # Be cautious with True, ensure ALB security group restricts access
    # --- Cognito Authentication Parameters ---
    enable_cognito_auth: bool = False,
    cognito_user_pool: Optional[cognito.IUserPool] = None,
    cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
    cognito_user_pool_domain: Optional[
        str
    ] = None,  # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
    cognito_auth_scope: Optional[
        str
    ] = "openid profile email",  # Default recommended scope
    cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
    stickiness_cookie_duration=None,
    # --- End Cognito Parameters ---
) -> Optional[elb.ApplicationListener]:
    """
    Add an HTTPS listener to an ALB when a certificate ARN is supplied,
    optionally placing Cognito User Pool authentication in front of it.

    Args:
        scope (Construct): Construct scope (e.g. the CDK stack). Not used by
            the body itself; the listener is attached through ``alb``.
        logical_id (str): Logical ID for the listener construct.
        alb (elb.ApplicationLoadBalancer): Load balancer receiving the listener.
        acm_certificate_arn (Optional[str]): ARN of the ACM certificate. When
            falsy, no listener is created.
        default_target_group (elb.ITargetGroup): Target group that traffic is
            forwarded to (after authentication when Cognito is enabled).
        listener_port_https (int): HTTPS port to listen on (default: 443).
        listener_open_to_internet (bool): Passed as the listener's ``open``
            flag. If False, the ALB security group must allow inbound traffic
            on this port from the desired sources.
        enable_cognito_auth (bool): Must be exactly True to enable Cognito.
        cognito_user_pool (Optional[cognito.IUserPool]): Required with Cognito.
        cognito_user_pool_client (Optional[cognito.IUserPoolClient]): Required
            with Cognito.
        cognito_user_pool_domain (Optional[str]): Required with Cognito.
        cognito_auth_scope (Optional[str]): Scope for Cognito authentication.
        cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction):
            Action for unauthenticated requests (default: AUTHENTICATE).
        stickiness_cookie_duration: Passed as ``session_timeout`` of the
            Cognito authentication action.

    Returns:
        Optional[elb.ApplicationListener]: The created listener, or None when
        no ACM certificate ARN was provided.

    Raises:
        ValueError: If Cognito is enabled but the pool, client, or domain is missing.
    """
    # Without a certificate there is nothing to build.
    if not acm_certificate_arn:
        print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
        return None

    certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
    print(
        f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
    )

    if enable_cognito_auth is True:
        required_cognito_parts = [
            cognito_user_pool,
            cognito_user_pool_client,
            cognito_user_pool_domain,
        ]
        if not all(required_cognito_parts):
            raise ValueError(
                "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
            )
        print(
            f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
        )

        # Authenticate first, then forward to the target group.
        forward_after_auth = elb.ListenerAction.forward([default_target_group])
        default_action = elb_act.AuthenticateCognitoAction(
            next=forward_after_auth,
            user_pool=cognito_user_pool,
            user_pool_client=cognito_user_pool_client,
            user_pool_domain=cognito_user_pool_domain,
            scope=cognito_auth_scope,
            on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
            session_timeout=stickiness_cookie_duration,
        )
    else:
        default_action = elb.ListenerAction.forward([default_target_group])
        print("Cognito authentication is NOT enabled for this listener.")

    https_listener = alb.add_listener(
        logical_id,
        port=listener_port_https,
        open=listener_open_to_internet,
        certificates=certificates_list,
        default_action=default_action,
    )
    print(f"ALB HTTPS listener on port {listener_port_https} defined.")

    return https_listener
1313
+
1314
+
1315
def ensure_folder_exists(output_folder: str):
    """Create the given folder (including parents) unless it already exists."""
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # exist_ok covers the case where the folder appears between check and creation.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
1324
+
1325
+
1326
def create_basic_config_env(
    out_dir: str = "config",
    S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
    S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
    ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
    FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
    USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
):
    """
    Write a starter ``config.env`` for a newly deployed redaction app.

    The file is placed in ``out_dir`` (created if needed). Each setting is
    written with ``set_key``, unquoted; keys already present in an existing
    file are updated by that call.

    Args:
        out_dir: Folder that will contain ``config.env``.
        S3_LOG_CONFIG_BUCKET_NAME: Written as DOCUMENT_REDACTION_BUCKET.
        S3_OUTPUT_BUCKET_NAME: Written as TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET.
        ACCESS_LOG_DYNAMODB_TABLE_NAME: Access log table name.
        FEEDBACK_LOG_DYNAMODB_TABLE_NAME: Feedback log table name.
        USAGE_LOG_DYNAMODB_TABLE_NAME: Usage log table name.

    Returns:
        The dictionary of variables that was written.
    """
    # Fixed feature flags first, deployment-specific names second; the order
    # here is the order in which keys are written.
    variables = {
        "COGNITO_AUTH": "True",
        "RUN_AWS_FUNCTIONS": "True",
        "DISPLAY_FILE_NAMES_IN_LOGS": "False",
        "SESSION_OUTPUT_FOLDER": "True",
        "SAVE_LOGS_TO_DYNAMODB": "True",
        "SHOW_COSTS": "True",
        "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
        "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
        "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
        "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
        "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
        "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
        "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
    }

    ensure_folder_exists(out_dir + "/")
    env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))

    # Make sure the file exists up front so set_key always updates in place.
    if not os.path.exists(env_file_path):
        with open(env_file_path, "w"):
            pass

    for name in variables:
        set_key(env_file_path, name, str(variables[name]), quote_mode="never")

    return variables
1367
+
1368
+
1369
def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
    """
    Start a build of an existing CodeBuild project and print its details.

    Failures are reported on stdout; nothing is raised and nothing is returned.

    Args:
        PROJECT_NAME: Name of the CodeBuild project to build.
        AWS_REGION: Region of the project (defaults to the module-level AWS_REGION).
    """
    client = boto3.client("codebuild", region_name=AWS_REGION)

    try:
        print(f"Attempting to start build for project: {PROJECT_NAME}")

        build = client.start_build(projectName=PROJECT_NAME)["build"]
        build_id = build["id"]

        print(f"Successfully started build with ID: {build_id}")
        print(f"Build ARN: {build['arn']}")
        print("Build URL (approximate - construct based on region and ID):")

        # The console URL uses only the part of the build ID after the last colon.
        build_suffix = build_id.split(":")[-1]
        print(
            f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_suffix}/detail"
        )
    except client.exceptions.ResourceNotFoundException:
        print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
1399
+
1400
+
1401
def upload_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str,
    RUN_AWS_FUNCTIONS: str = "1",
):
    """
    Upload one or more local files to an S3 bucket.

    Args:
    - local_file_paths: A single path or a list of paths to upload.
    - s3_key: Key prefix in the bucket; each file's base name is appended to it.
    - s3_bucket: Name of the destination S3 bucket.
    - RUN_AWS_FUNCTIONS: Uploads happen only when this is the string "1".

    Returns:
    - A message string: one line per file describing success or failure, or a
      single explanatory message when nothing was attempted.
    """
    messages = []
    final_out_message_str = ""

    # Uploads are opt-in through the flag.
    if RUN_AWS_FUNCTIONS != "1":
        return "App not set to run AWS functions"

    try:
        if not (s3_bucket and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        # Accept a bare string as a one-element list.
        if isinstance(local_file_paths, str):
            local_file_paths = [local_file_paths]

        for path in local_file_paths:
            if not s3_client:
                final_out_message_str = "Could not connect to AWS."
                continue

            try:
                # Destination key = prefix + file name without directories.
                file_name = os.path.basename(path)
                s3_key_full = s3_key + file_name
                print("S3 key: ", s3_key_full)

                s3_client.upload_file(path, s3_bucket, s3_key_full)
                out_message = "File " + file_name + " uploaded successfully!"
                print(out_message)
            except Exception as e:
                # One failed file does not stop the remaining uploads.
                out_message = f"Error uploading file(s): {e}"
                print(out_message)

            messages.append(out_message)
            final_out_message_str = "\n".join(messages)
    except Exception as e:
        final_out_message_str = "Could not upload files to S3 due to: " + str(e)
        print(final_out_message_str)

    return final_out_message_str
1466
+
1467
+
1468
+ # Initialize ECS client
1469
def start_ecs_task(cluster_name, service_name):
    """
    Set an ECS service's desired task count to 1.

    Args:
        cluster_name: Cluster that hosts the service.
        service_name: Service to update.

    Returns:
        A dict with "statusCode" (200 on success, 500 on any exception) and a
        "body" message describing the outcome.
    """
    ecs_client = boto3.client("ecs")

    try:
        # Update the service to set the desired count to 1
        ecs_client.update_service(
            cluster=cluster_name, service=service_name, desiredCount=1
        )
        success_body = f"Service {service_name} in cluster {cluster_name} has been updated to 1 task."
        return {"statusCode": 200, "body": success_body}
    except Exception as e:
        failure_body = f"Error updating service: {str(e)}"
        return {"statusCode": 500, "body": failure_body}
cdk/cdk_stack.py ADDED
@@ -0,0 +1,1869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json # You might still need json if loading task_definition.json
2
+ import os
3
+ from typing import Any, Dict, List
4
+
5
+ from aws_cdk import (
6
+ CfnOutput, # <-- Import CfnOutput directly
7
+ Duration,
8
+ RemovalPolicy,
9
+ SecretValue,
10
+ Stack,
11
+ )
12
+ from aws_cdk import aws_cloudfront as cloudfront
13
+ from aws_cdk import aws_cloudfront_origins as origins
14
+ from aws_cdk import aws_codebuild as codebuild
15
+ from aws_cdk import aws_cognito as cognito
16
+ from aws_cdk import aws_dynamodb as dynamodb # Import the DynamoDB module
17
+ from aws_cdk import aws_ec2 as ec2
18
+ from aws_cdk import aws_ecr as ecr
19
+ from aws_cdk import aws_ecs as ecs
20
+ from aws_cdk import aws_elasticloadbalancingv2 as elbv2
21
+ from aws_cdk import aws_iam as iam
22
+ from aws_cdk import aws_kms as kms
23
+ from aws_cdk import aws_logs as logs
24
+ from aws_cdk import aws_s3 as s3
25
+ from aws_cdk import aws_secretsmanager as secretsmanager
26
+ from aws_cdk import aws_wafv2 as wafv2
27
+ from cdk_config import (
28
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
29
+ ACM_SSL_CERTIFICATE_ARN,
30
+ ALB_NAME,
31
+ ALB_NAME_SECURITY_GROUP_NAME,
32
+ ALB_TARGET_GROUP_NAME,
33
+ AWS_ACCOUNT_ID,
34
+ AWS_MANAGED_TASK_ROLES_LIST,
35
+ AWS_REGION,
36
+ CDK_PREFIX,
37
+ CLOUDFRONT_DISTRIBUTION_NAME,
38
+ CLOUDFRONT_GEO_RESTRICTION,
39
+ CLUSTER_NAME,
40
+ CODEBUILD_PROJECT_NAME,
41
+ CODEBUILD_ROLE_NAME,
42
+ COGNITO_ACCESS_TOKEN_VALIDITY,
43
+ COGNITO_ID_TOKEN_VALIDITY,
44
+ COGNITO_REDIRECTION_URL,
45
+ COGNITO_REFRESH_TOKEN_VALIDITY,
46
+ COGNITO_USER_POOL_CLIENT_NAME,
47
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
48
+ COGNITO_USER_POOL_DOMAIN_PREFIX,
49
+ COGNITO_USER_POOL_NAME,
50
+ CUSTOM_HEADER,
51
+ CUSTOM_HEADER_VALUE,
52
+ CUSTOM_KMS_KEY_NAME,
53
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
54
+ ECR_CDK_REPO_NAME,
55
+ ECS_LOG_GROUP_NAME,
56
+ ECS_READ_ONLY_FILE_SYSTEM,
57
+ ECS_SECURITY_GROUP_NAME,
58
+ ECS_SERVICE_NAME,
59
+ ECS_TASK_CPU_SIZE,
60
+ ECS_TASK_EXECUTION_ROLE_NAME,
61
+ ECS_TASK_MEMORY_SIZE,
62
+ ECS_TASK_ROLE_NAME,
63
+ ECS_USE_FARGATE_SPOT,
64
+ EXISTING_IGW_ID,
65
+ FARGATE_TASK_DEFINITION_NAME,
66
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
67
+ GITHUB_REPO_BRANCH,
68
+ GITHUB_REPO_NAME,
69
+ GITHUB_REPO_USERNAME,
70
+ GRADIO_SERVER_PORT,
71
+ LOAD_BALANCER_WEB_ACL_NAME,
72
+ NAT_GATEWAY_NAME,
73
+ NEW_VPC_CIDR,
74
+ NEW_VPC_DEFAULT_NAME,
75
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
76
+ PRIVATE_SUBNET_CIDR_BLOCKS,
77
+ PRIVATE_SUBNETS_TO_USE,
78
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
79
+ PUBLIC_SUBNET_CIDR_BLOCKS,
80
+ PUBLIC_SUBNETS_TO_USE,
81
+ S3_LOG_CONFIG_BUCKET_NAME,
82
+ S3_OUTPUT_BUCKET_NAME,
83
+ SAVE_LOGS_TO_DYNAMODB,
84
+ SINGLE_NAT_GATEWAY_ID,
85
+ TASK_DEFINITION_FILE_LOCATION,
86
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
87
+ USE_CLOUDFRONT,
88
+ USE_CUSTOM_KMS_KEY,
89
+ VPC_NAME,
90
+ WEB_ACL_NAME,
91
+ )
92
+ from cdk_functions import ( # Only keep CDK-native functions
93
+ add_alb_https_listener_with_cert,
94
+ add_custom_policies,
95
+ create_nat_gateway,
96
+ create_subnets,
97
+ create_web_acl_with_common_rules,
98
+ )
99
+ from constructs import Construct
100
+
101
+
102
+ def _get_env_list(env_var_name: str) -> List[str]:
103
+ """Parses a comma-separated environment variable into a list of strings."""
104
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
105
+ if not value:
106
+ return []
107
+ # Split by comma and filter out any empty strings that might result from extra commas
108
+ return [s.strip() for s in value.split(",") if s.strip()]
109
+
110
+
111
# Convert the list-like settings imported from cdk_config into Python lists.
# Each setting is only converted when it is truthy, so an empty value is left
# exactly as it was imported.
#
# _get_env_list parses the string it is given, so every call must pass the
# setting's value. Passing the setting's *name* as a string literal would
# replace the configured value with a mangled copy of that name.
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )

if AWS_MANAGED_TASK_ROLES_LIST:
    AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
130
+
131
+
132
+ class CdkStack(Stack):
133
+
134
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
135
+ super().__init__(scope, construct_id, **kwargs)
136
+
137
+ # --- Helper to get context values ---
138
+ def get_context_bool(key: str, default: bool = False) -> bool:
139
+ return self.node.try_get_context(key) or default
140
+
141
+ def get_context_str(key: str, default: str = None) -> str:
142
+ return self.node.try_get_context(key) or default
143
+
144
+ def get_context_dict(key: str, default: dict = None) -> dict:
145
+ return self.node.try_get_context(key) or default
146
+
147
+ def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
148
+ ctx_value = self.node.try_get_context(key)
149
+ if not isinstance(ctx_value, list):
150
+ print(
151
+ f"Warning: Context key '{key}' not found or not a list. Returning empty list."
152
+ )
153
+ return []
154
+ # Optional: Add validation that all items in the list are dicts
155
+ return ctx_value
156
+
157
+ self.template_options.description = "Deployment of the 'doc_redaction' PDF, image, and XLSX/CSV redaction app. Git repo available at: https://github.com/seanpedrick-case/doc_redaction."
158
+
159
+ # --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
160
+ new_vpc_created = False
161
+ if VPC_NAME:
162
+ print("Looking for current VPC:", VPC_NAME)
163
+ try:
164
+ vpc = ec2.Vpc.from_lookup(self, "VPC", vpc_name=VPC_NAME)
165
+ print("Successfully looked up VPC:", vpc.vpc_id)
166
+ except Exception as e:
167
+ raise Exception(
168
+ f"Could not look up VPC with name '{VPC_NAME}' due to: {e}"
169
+ )
170
+
171
+ elif NEW_VPC_DEFAULT_NAME:
172
+ new_vpc_created = True
173
+ print(
174
+ f"NEW_VPC_DEFAULT_NAME ('{NEW_VPC_DEFAULT_NAME}') is set. Creating a new VPC."
175
+ )
176
+
177
+ # Configuration for the new VPC
178
+ # You can make these configurable via context as well, e.g.,
179
+ # new_vpc_cidr = self.node.try_get_context("new_vpc_cidr") or "10.0.0.0/24"
180
+ # new_vpc_max_azs = self.node.try_get_context("new_vpc_max_azs") or 2 # Use 2 AZs by default for HA
181
+ # new_vpc_nat_gateways = self.node.try_get_context("new_vpc_nat_gateways") or new_vpc_max_azs # One NAT GW per AZ for HA
182
+ # or 1 for cost savings if acceptable
183
+ if not NEW_VPC_CIDR:
184
+ raise Exception(
185
+ "App has been instructed to create a new VPC but not VPC CDR range provided to variable NEW_VPC_CIDR"
186
+ )
187
+
188
+ print("Provided NEW_VPC_CIDR range:", NEW_VPC_CIDR)
189
+
190
+ new_vpc_cidr = NEW_VPC_CIDR
191
+ new_vpc_max_azs = 2 # Creates resources in 2 AZs. Adjust as needed.
192
+
193
+ # For "a NAT gateway", you can set nat_gateways=1.
194
+ # For resilience (NAT GW per AZ), set nat_gateways=new_vpc_max_azs.
195
+ # The Vpc construct will create NAT Gateway(s) if subnet_type PRIVATE_WITH_EGRESS is used
196
+ # and nat_gateways > 0.
197
+ new_vpc_nat_gateways = (
198
+ 1 # Creates a single NAT Gateway for cost-effectiveness.
199
+ )
200
+ # If you need one per AZ for higher availability, set this to new_vpc_max_azs.
201
+
202
+ vpc = ec2.Vpc(
203
+ self,
204
+ "MyNewLogicalVpc", # This is the CDK construct ID
205
+ vpc_name=NEW_VPC_DEFAULT_NAME,
206
+ ip_addresses=ec2.IpAddresses.cidr(new_vpc_cidr),
207
+ max_azs=new_vpc_max_azs,
208
+ nat_gateways=new_vpc_nat_gateways, # Number of NAT gateways to create
209
+ subnet_configuration=[
210
+ ec2.SubnetConfiguration(
211
+ name="Public", # Name prefix for public subnets
212
+ subnet_type=ec2.SubnetType.PUBLIC,
213
+ cidr_mask=28, # Adjust CIDR mask as needed (e.g., /24 provides ~250 IPs per subnet)
214
+ ),
215
+ ec2.SubnetConfiguration(
216
+ name="Private", # Name prefix for private subnets
217
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, # Ensures these subnets have NAT Gateway access
218
+ cidr_mask=28, # Adjust CIDR mask as needed
219
+ ),
220
+ # You could also add ec2.SubnetType.PRIVATE_ISOLATED if needed
221
+ ],
222
+ # Internet Gateway is created and configured automatically for PUBLIC subnets.
223
+ # Route tables for public subnets will point to the IGW.
224
+ # Route tables for PRIVATE_WITH_EGRESS subnets will point to the NAT Gateway(s).
225
+ )
226
+ print(
227
+ f"Successfully created new VPC: {vpc.vpc_id} with name '{NEW_VPC_DEFAULT_NAME}'"
228
+ )
229
+ # If nat_gateways > 0, vpc.nat_gateway_ips will contain EIPs if Vpc created them.
230
+ # vpc.public_subnets, vpc.private_subnets, vpc.isolated_subnets are populated.
231
+
232
+ else:
233
+ raise Exception(
234
+ "VPC_NAME for current VPC not found, and NEW_VPC_DEFAULT_NAME not found to create a new VPC"
235
+ )
236
+
237
+ # --- Subnet Handling (Check Context and Create/Import) ---
238
+ # Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
239
+ # We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
240
+ self.public_subnets: List[ec2.ISubnet] = []
241
+ self.private_subnets: List[ec2.ISubnet] = []
242
+ # Store L1 CfnRouteTables explicitly if you need to reference them later
243
+ self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
244
+ self.public_route_tables_cfn: List[ec2.CfnRouteTable] = (
245
+ []
246
+ ) # New: to store public RTs
247
+
248
+ names_to_create_private = []
249
+ names_to_create_public = []
250
+
251
+ if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
252
+ print(
253
+ "Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets."
254
+ )
255
+
256
+ print("vpc.public_subnets:", vpc.public_subnets)
257
+ print("vpc.private_subnets:", vpc.private_subnets)
258
+
259
+ if (
260
+ vpc.public_subnets
261
+ ): # These are already one_per_az if max_azs was used and Vpc created them
262
+ self.public_subnets.extend(vpc.public_subnets)
263
+ else:
264
+ self.node.add_warning("No public subnets found in the VPC.")
265
+
266
+ # Get private subnets with egress specifically
267
+ # selected_private_subnets_with_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
268
+
269
+ print(
270
+ f"Selected from VPC: {len(self.public_subnets)} public, {len(self.private_subnets)} private_with_egress subnets."
271
+ )
272
+
273
+ if (
274
+ len(self.public_subnets) < 1 or len(self.private_subnets) < 1
275
+ ): # Simplified check for new VPC
276
+ # If new_vpc_max_azs was 1, you'd have 1 of each. If 2, then 2 of each.
277
+ # The original check ' < 2' might be too strict if new_vpc_max_azs=1
278
+ pass # For new VPC, allow single AZ setups if configured that way. The VPC construct ensures one per AZ up to max_azs.
279
+
280
+ if not self.public_subnets and not self.private_subnets:
281
+ print(
282
+ "Error: No public or private subnets could be found in the VPC for automatic selection. "
283
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
284
+ )
285
+ raise RuntimeError("No suitable subnets found for automatic selection.")
286
+ else:
287
+ print(
288
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC properties."
289
+ )
290
+
291
+ selected_public_subnets = vpc.select_subnets(
292
+ subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True
293
+ )
294
+ private_subnets_egress = vpc.select_subnets(
295
+ subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True
296
+ )
297
+
298
+ if private_subnets_egress.subnets:
299
+ self.private_subnets.extend(private_subnets_egress.subnets)
300
+ else:
301
+ self.node.add_warning(
302
+ "No PRIVATE_WITH_EGRESS subnets found in the VPC."
303
+ )
304
+
305
+ try:
306
+ private_subnets_isolated = vpc.select_subnets(
307
+ subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True
308
+ )
309
+ except Exception as e:
310
+ private_subnets_isolated = []
311
+ print("Could not find any isolated subnets due to:", e)
312
+
313
+ ###
314
+ combined_subnet_objects = []
315
+
316
+ if private_subnets_isolated:
317
+ if private_subnets_egress.subnets:
318
+ # Add the first PRIVATE_WITH_EGRESS subnet
319
+ combined_subnet_objects.append(private_subnets_egress.subnets[0])
320
+ elif not private_subnets_isolated:
321
+ if private_subnets_egress.subnets:
322
+ # Add the first PRIVATE_WITH_EGRESS subnet
323
+ combined_subnet_objects.extend(private_subnets_egress.subnets)
324
+ else:
325
+ self.node.add_warning(
326
+ "No PRIVATE_WITH_EGRESS subnets found to select the first one."
327
+ )
328
+
329
+ # Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
330
+ try:
331
+ if len(private_subnets_isolated.subnets) > 1:
332
+ combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
333
+ elif (
334
+ private_subnets_isolated.subnets
335
+ ): # Only 1 isolated subnet, add a warning if [1:] was desired
336
+ self.node.add_warning(
337
+ "Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty."
338
+ )
339
+ else:
340
+ self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
341
+ except Exception as e:
342
+ print("Could not identify private isolated subnets due to:", e)
343
+
344
+ # Create an ec2.SelectedSubnets object from the combined private subnet list.
345
+ selected_private_subnets = vpc.select_subnets(
346
+ subnets=combined_subnet_objects
347
+ )
348
+
349
+ print("selected_public_subnets:", selected_public_subnets)
350
+ print("selected_private_subnets:", selected_private_subnets)
351
+
352
+ if (
353
+ len(selected_public_subnets.subnet_ids) < 2
354
+ or len(selected_private_subnets.subnet_ids) < 2
355
+ ):
356
+ raise Exception(
357
+ "Need at least two public or private subnets in different availability zones"
358
+ )
359
+
360
+ if not selected_public_subnets and not selected_private_subnets:
361
+ # If no subnets could be found even with automatic selection, raise an error.
362
+ # This ensures the stack doesn't proceed if it absolutely needs subnets.
363
+ print(
364
+ "Error: No existing public or private subnets could be found in the VPC for automatic selection. "
365
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets."
366
+ )
367
+ raise RuntimeError("No suitable subnets found for automatic selection.")
368
+ else:
369
+ self.public_subnets = selected_public_subnets.subnets
370
+ self.private_subnets = selected_private_subnets.subnets
371
+ print(
372
+ f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery."
373
+ )
374
+
375
+ print("self.public_subnets:", self.public_subnets)
376
+ print("self.private_subnets:", self.private_subnets)
377
+ # Since subnets are now assigned, we can exit this processing block.
378
+ # The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
379
+
380
+ checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
381
+ get_context_dict("checked_private_subnets")
382
+
383
+ public_subnets_data_for_creation_ctx = get_context_list_of_dicts(
384
+ "public_subnets_to_create"
385
+ )
386
+ private_subnets_data_for_creation_ctx = get_context_list_of_dicts(
387
+ "private_subnets_to_create"
388
+ )
389
+
390
+ # --- 3. Process Public Subnets ---
391
+ print("\n--- Processing Public Subnets ---")
392
+ # Import existing public subnets
393
+ if checked_public_subnets_ctx:
394
+ for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
395
+ subnet_info = checked_public_subnets_ctx.get(subnet_name)
396
+ if subnet_info and subnet_info.get("exists"):
397
+ subnet_id = subnet_info.get("id")
398
+ if not subnet_id:
399
+ raise RuntimeError(
400
+ f"Context for existing public subnet '{subnet_name}' is missing 'id'."
401
+ )
402
+ try:
403
+ ec2.Subnet.from_subnet_id(
404
+ self,
405
+ f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}",
406
+ subnet_id,
407
+ )
408
+ # self.public_subnets.append(imported_subnet)
409
+ print(
410
+ f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})"
411
+ )
412
+ except Exception as e:
413
+ raise RuntimeError(
414
+ f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}"
415
+ )
416
+
417
+ # Create new public subnets based on public_subnets_data_for_creation_ctx
418
+ if public_subnets_data_for_creation_ctx:
419
+ names_to_create_public = [
420
+ s["name"] for s in public_subnets_data_for_creation_ctx
421
+ ]
422
+ cidrs_to_create_public = [
423
+ s["cidr"] for s in public_subnets_data_for_creation_ctx
424
+ ]
425
+ azs_to_create_public = [
426
+ s["az"] for s in public_subnets_data_for_creation_ctx
427
+ ]
428
+
429
+ if names_to_create_public:
430
+ print(
431
+ f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}"
432
+ )
433
+ newly_created_public_subnets, newly_created_public_rts_cfn = (
434
+ create_subnets(
435
+ self,
436
+ vpc,
437
+ CDK_PREFIX,
438
+ names_to_create_public,
439
+ cidrs_to_create_public,
440
+ azs_to_create_public,
441
+ is_public=True,
442
+ internet_gateway_id=EXISTING_IGW_ID,
443
+ )
444
+ )
445
+ self.public_subnets.extend(newly_created_public_subnets)
446
+ self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
447
+
448
+ if (
449
+ not self.public_subnets
450
+ and not names_to_create_public
451
+ and not PUBLIC_SUBNETS_TO_USE
452
+ ):
453
+ raise Exception("No public subnets found or created, exiting.")
454
+
455
+ # --- NAT Gateway Creation/Lookup ---
456
+ print("Creating NAT gateway/located existing")
457
+ self.single_nat_gateway_id = None
458
+
459
+ nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID
460
+
461
+ if nat_gw_id_from_context:
462
+ print(
463
+ f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}"
464
+ )
465
+ self.single_nat_gateway_id = nat_gw_id_from_context
466
+
467
+ elif (
468
+ new_vpc_created
469
+ and new_vpc_nat_gateways > 0
470
+ and hasattr(vpc, "nat_gateways")
471
+ and vpc.nat_gateways
472
+ ):
473
+ self.single_nat_gateway_id = vpc.nat_gateways[0].gateway_id
474
+ print(
475
+ f"Using NAT Gateway {self.single_nat_gateway_id} created by the new VPC construct."
476
+ )
477
+
478
+ if not self.single_nat_gateway_id:
479
+ print("Creating a new NAT gateway")
480
+
481
+ if hasattr(vpc, "nat_gateways") and vpc.nat_gateways:
482
+ print("Existing NAT gateway found in vpc")
483
+ pass
484
+
485
+ # If not in context, create a new one, but only if we have a public subnet.
486
+ elif self.public_subnets:
487
+ print("NAT Gateway ID not found in context. Creating a new one.")
488
+ # Place the NAT GW in the first available public subnet
489
+ first_public_subnet = self.public_subnets[0]
490
+
491
+ self.single_nat_gateway_id = create_nat_gateway(
492
+ self,
493
+ first_public_subnet,
494
+ nat_gateway_name=NAT_GATEWAY_NAME,
495
+ nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID,
496
+ )
497
+ else:
498
+ print(
499
+ "WARNING: No public subnets available and NAT gateway not found in existing VPC. Cannot create a NAT Gateway."
500
+ )
501
+
502
+ # --- 4. Process Private Subnets ---
503
+ print("\n--- Processing Private Subnets ---")
504
+ # ... (rest of your existing subnet processing logic for checked_private_subnets_ctx) ...
505
+ # (This part for importing existing subnets remains the same)
506
+
507
+ # Create new private subnets
508
+ if private_subnets_data_for_creation_ctx:
509
+ names_to_create_private = [
510
+ s["name"] for s in private_subnets_data_for_creation_ctx
511
+ ]
512
+ cidrs_to_create_private = [
513
+ s["cidr"] for s in private_subnets_data_for_creation_ctx
514
+ ]
515
+ azs_to_create_private = [
516
+ s["az"] for s in private_subnets_data_for_creation_ctx
517
+ ]
518
+
519
+ if names_to_create_private:
520
+ print(
521
+ f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}"
522
+ )
523
+ # --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
524
+ # Ensure self.single_nat_gateway_id is available before this call
525
+ if not self.single_nat_gateway_id:
526
+ raise ValueError(
527
+ "A single NAT Gateway ID is required for private subnets but was not resolved."
528
+ )
529
+
530
+ newly_created_private_subnets_cfn, newly_created_private_rts_cfn = (
531
+ create_subnets(
532
+ self,
533
+ vpc,
534
+ CDK_PREFIX,
535
+ names_to_create_private,
536
+ cidrs_to_create_private,
537
+ azs_to_create_private,
538
+ is_public=False,
539
+ single_nat_gateway_id=self.single_nat_gateway_id, # Pass the single NAT Gateway ID
540
+ )
541
+ )
542
+ self.private_subnets.extend(newly_created_private_subnets_cfn)
543
+ self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
544
+ print(
545
+ f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation."
546
+ )
547
+ else:
548
+ print(
549
+ "No private subnets specified for creation in context ('private_subnets_to_create')."
550
+ )
551
+
552
+ # if not self.private_subnets:
553
+ # raise Exception("No private subnets found or created, exiting.")
554
+
555
+ if (
556
+ not self.private_subnets
557
+ and not names_to_create_private
558
+ and not PRIVATE_SUBNETS_TO_USE
559
+ ):
560
+ # This condition might need adjustment for new VPCs.
561
+ raise Exception("No private subnets found or created, exiting.")
562
+
563
+ # --- 5. Sanity Check and Output ---
564
+ # Output the single NAT Gateway ID for verification
565
+ if self.single_nat_gateway_id:
566
+ CfnOutput(
567
+ self,
568
+ "SingleNatGatewayId",
569
+ value=self.single_nat_gateway_id,
570
+ description="ID of the single NAT Gateway resolved or created.",
571
+ )
572
+ elif (
573
+ NEW_VPC_DEFAULT_NAME
574
+ and (self.node.try_get_context("new_vpc_nat_gateways") or 1) > 0
575
+ ):
576
+ print(
577
+ "INFO: A new VPC was created with NAT Gateway(s). Their routing is handled by the VPC construct. No single_nat_gateway_id was explicitly set for separate output."
578
+ )
579
+ else:
580
+ out_message = "WARNING: No single NAT Gateway was resolved or created explicitly by the script's logic after VPC setup."
581
+ print(out_message)
582
+ raise Exception(out_message)
583
+
584
+ # --- Outputs for other stacks/regions ---
585
+ # These are crucial for cross-stack, cross-region referencing
586
+
587
+ self.params = dict()
588
+ self.params["vpc_id"] = vpc.vpc_id
589
+ self.params["private_subnets"] = self.private_subnets
590
+ self.params["private_route_tables"] = self.private_route_tables_cfn
591
+ self.params["public_subnets"] = self.public_subnets
592
+ self.params["public_route_tables"] = self.public_route_tables_cfn
593
+
594
+ private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
595
+ public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
596
+
597
+ for sub in private_subnet_selection.subnets:
598
+ print(
599
+ "private subnet:",
600
+ sub.subnet_id,
601
+ "is in availability zone:",
602
+ sub.availability_zone,
603
+ )
604
+
605
+ for sub in public_subnet_selection.subnets:
606
+ print(
607
+ "public subnet:",
608
+ sub.subnet_id,
609
+ "is in availability zone:",
610
+ sub.availability_zone,
611
+ )
612
+
613
+ print("Private subnet route tables:", self.private_route_tables_cfn)
614
+
615
+ # Add the S3 Gateway Endpoint to the VPC
616
+ if names_to_create_private:
617
+ try:
618
+ s3_gateway_endpoint = vpc.add_gateway_endpoint(
619
+ "S3GatewayEndpoint",
620
+ service=ec2.GatewayVpcEndpointAwsService.S3,
621
+ subnets=[private_subnet_selection],
622
+ )
623
+ except Exception as e:
624
+ print("Could not add S3 gateway endpoint to subnets due to:", e)
625
+
626
+ # Output some useful information
627
+ CfnOutput(
628
+ self,
629
+ "VpcIdOutput",
630
+ value=vpc.vpc_id,
631
+ description="The ID of the VPC where the S3 Gateway Endpoint is deployed.",
632
+ )
633
+ CfnOutput(
634
+ self,
635
+ "S3GatewayEndpointService",
636
+ value=s3_gateway_endpoint.vpc_endpoint_id,
637
+ description="The id for the S3 Gateway Endpoint.",
638
+ ) # Specify the S3 service
639
+
640
+ # --- IAM Roles ---
641
+ if USE_CUSTOM_KMS_KEY == "1":
642
+ kms_key = kms.Key(
643
+ self,
644
+ "RedactionSharedKmsKey",
645
+ alias=CUSTOM_KMS_KEY_NAME,
646
+ removal_policy=RemovalPolicy.DESTROY,
647
+ )
648
+
649
+ custom_sts_kms_policy_dict = {
650
+ "Version": "2012-10-17",
651
+ "Statement": [
652
+ {
653
+ "Sid": "STSCallerIdentity",
654
+ "Effect": "Allow",
655
+ "Action": ["sts:GetCallerIdentity"],
656
+ "Resource": "*",
657
+ },
658
+ {
659
+ "Sid": "KMSAccess",
660
+ "Effect": "Allow",
661
+ "Action": ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"],
662
+ "Resource": kms_key.key_arn, # Use key_arn, as it's the full ARN, safer than key_id
663
+ },
664
+ ],
665
+ }
666
+ else:
667
+ kms_key = None
668
+
669
+ custom_sts_kms_policy_dict = {
670
+ "Version": "2012-10-17",
671
+ "Statement": [
672
+ {
673
+ "Sid": "STSCallerIdentity",
674
+ "Effect": "Allow",
675
+ "Action": ["sts:GetCallerIdentity"],
676
+ "Resource": "*",
677
+ },
678
+ {
679
+ "Sid": "KMSSecretsManagerDecrypt", # Explicitly add decrypt for default key
680
+ "Effect": "Allow",
681
+ "Action": ["kms:Decrypt"],
682
+ "Resource": f"arn:aws:kms:{AWS_REGION}:{AWS_ACCOUNT_ID}:key/aws/secretsmanager",
683
+ },
684
+ ],
685
+ }
686
+ custom_sts_kms_policy = json.dumps(custom_sts_kms_policy_dict, indent=4)
687
+
688
+ try:
689
+ codebuild_role_name = CODEBUILD_ROLE_NAME
690
+
691
+ if get_context_bool(f"exists:{codebuild_role_name}"):
692
+ # If exists, lookup/import the role using ARN from context
693
+ role_arn = get_context_str(f"arn:{codebuild_role_name}")
694
+ if not role_arn:
695
+ raise ValueError(
696
+ f"Context value 'arn:{codebuild_role_name}' is required if role exists."
697
+ )
698
+ codebuild_role = iam.Role.from_role_arn(
699
+ self, "CodeBuildRole", role_arn=role_arn
700
+ )
701
+ print("Using existing CodeBuild role")
702
+ else:
703
+ # If not exists, create the role
704
+ codebuild_role = iam.Role(
705
+ self,
706
+ "CodeBuildRole", # Logical ID
707
+ role_name=codebuild_role_name, # Explicit resource name
708
+ assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
709
+ )
710
+ codebuild_role.add_managed_policy(
711
+ iam.ManagedPolicy.from_aws_managed_policy_name(
712
+ "EC2InstanceProfileForImageBuilderECRContainerBuilds"
713
+ )
714
+ )
715
+ print("Successfully created new CodeBuild role")
716
+
717
+ task_role_name = ECS_TASK_ROLE_NAME
718
+ if get_context_bool(f"exists:{task_role_name}"):
719
+ role_arn = get_context_str(f"arn:{task_role_name}")
720
+ if not role_arn:
721
+ raise ValueError(
722
+ f"Context value 'arn:{task_role_name}' is required if role exists."
723
+ )
724
+ task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
725
+ print("Using existing ECS task role")
726
+ else:
727
+ task_role = iam.Role(
728
+ self,
729
+ "TaskRole", # Logical ID
730
+ role_name=task_role_name, # Explicit resource name
731
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
732
+ )
733
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
734
+ print(f"Adding {role} to policy")
735
+ task_role.add_managed_policy(
736
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
737
+ )
738
+ task_role = add_custom_policies(
739
+ self, task_role, custom_policy_text=custom_sts_kms_policy
740
+ )
741
+ print("Successfully created new ECS task role")
742
+
743
+ execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
744
+ if get_context_bool(f"exists:{execution_role_name}"):
745
+ role_arn = get_context_str(f"arn:{execution_role_name}")
746
+ if not role_arn:
747
+ raise ValueError(
748
+ f"Context value 'arn:{execution_role_name}' is required if role exists."
749
+ )
750
+ execution_role = iam.Role.from_role_arn(
751
+ self, "ExecutionRole", role_arn=role_arn
752
+ )
753
+ print("Using existing ECS execution role")
754
+ else:
755
+ execution_role = iam.Role(
756
+ self,
757
+ "ExecutionRole", # Logical ID
758
+ role_name=execution_role_name, # Explicit resource name
759
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
760
+ )
761
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
762
+ execution_role.add_managed_policy(
763
+ iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}")
764
+ )
765
+ execution_role = add_custom_policies(
766
+ self, execution_role, custom_policy_text=custom_sts_kms_policy
767
+ )
768
+ print("Successfully created new ECS execution role")
769
+
770
+ except Exception as e:
771
+ raise Exception("Failed at IAM role step due to:", e)
772
+
773
+ # --- S3 Buckets ---
774
+ try:
775
+ log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
776
+ if get_context_bool(f"exists:{log_bucket_name}"):
777
+ bucket = s3.Bucket.from_bucket_name(
778
+ self, "LogConfigBucket", bucket_name=log_bucket_name
779
+ )
780
+ print("Using existing S3 bucket", log_bucket_name)
781
+ else:
782
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
783
+ bucket = s3.Bucket(
784
+ self,
785
+ "LogConfigBucket",
786
+ bucket_name=log_bucket_name,
787
+ versioned=False,
788
+ removal_policy=RemovalPolicy.DESTROY,
789
+ auto_delete_objects=True,
790
+ encryption=s3.BucketEncryption.KMS,
791
+ encryption_key=kms_key,
792
+ )
793
+ else:
794
+ bucket = s3.Bucket(
795
+ self,
796
+ "LogConfigBucket",
797
+ bucket_name=log_bucket_name,
798
+ versioned=False,
799
+ removal_policy=RemovalPolicy.DESTROY,
800
+ auto_delete_objects=True,
801
+ )
802
+
803
+ print("Created S3 bucket", log_bucket_name)
804
+
805
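+ # Add policies - these statements take effect for buckets created by this stack.
806
+ # NOTE: for a bucket imported with from_bucket_name, CDK does not manage the existing bucket policy, so add_to_resource_policy may add nothing; grant task_role access to a pre-existing bucket separately.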
807
+ bucket.add_to_resource_policy(
808
+ iam.PolicyStatement(
809
+ effect=iam.Effect.ALLOW,
810
+ principals=[task_role], # Pass the role object directly
811
+ actions=["s3:GetObject", "s3:PutObject"],
812
+ resources=[f"{bucket.bucket_arn}/*"],
813
+ )
814
+ )
815
+ bucket.add_to_resource_policy(
816
+ iam.PolicyStatement(
817
+ effect=iam.Effect.ALLOW,
818
+ principals=[task_role],
819
+ actions=["s3:ListBucket"],
820
+ resources=[bucket.bucket_arn],
821
+ )
822
+ )
823
+
824
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
825
+ if get_context_bool(f"exists:{output_bucket_name}"):
826
+ output_bucket = s3.Bucket.from_bucket_name(
827
+ self, "OutputBucket", bucket_name=output_bucket_name
828
+ )
829
+ print("Using existing Output bucket", output_bucket_name)
830
+ else:
831
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
832
+ output_bucket = s3.Bucket(
833
+ self,
834
+ "OutputBucket",
835
+ bucket_name=output_bucket_name,
836
+ lifecycle_rules=[
837
+ s3.LifecycleRule(
838
+ expiration=Duration.days(
839
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
840
+ )
841
+ )
842
+ ],
843
+ versioned=False,
844
+ removal_policy=RemovalPolicy.DESTROY,
845
+ auto_delete_objects=True,
846
+ encryption=s3.BucketEncryption.KMS,
847
+ encryption_key=kms_key,
848
+ )
849
+ else:
850
+ output_bucket = s3.Bucket(
851
+ self,
852
+ "OutputBucket",
853
+ bucket_name=output_bucket_name,
854
+ lifecycle_rules=[
855
+ s3.LifecycleRule(
856
+ expiration=Duration.days(
857
+ int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
858
+ )
859
+ )
860
+ ],
861
+ versioned=False,
862
+ removal_policy=RemovalPolicy.DESTROY,
863
+ auto_delete_objects=True,
864
+ )
865
+
866
+ print("Created Output bucket:", output_bucket_name)
867
+
868
+ # Add policies to output bucket
869
+ output_bucket.add_to_resource_policy(
870
+ iam.PolicyStatement(
871
+ effect=iam.Effect.ALLOW,
872
+ principals=[task_role],
873
+ actions=["s3:GetObject", "s3:PutObject"],
874
+ resources=[f"{output_bucket.bucket_arn}/*"],
875
+ )
876
+ )
877
+ output_bucket.add_to_resource_policy(
878
+ iam.PolicyStatement(
879
+ effect=iam.Effect.ALLOW,
880
+ principals=[task_role],
881
+ actions=["s3:ListBucket"],
882
+ resources=[output_bucket.bucket_arn],
883
+ )
884
+ )
885
+
886
+ except Exception as e:
887
+ raise Exception("Could not handle S3 buckets due to:", e)
888
+
889
+ # --- Elastic Container Registry ---
890
+ try:
891
+ full_ecr_repo_name = ECR_CDK_REPO_NAME
892
+ if get_context_bool(f"exists:{full_ecr_repo_name}"):
893
+ ecr_repo = ecr.Repository.from_repository_name(
894
+ self, "ECRRepo", repository_name=full_ecr_repo_name
895
+ )
896
+ print("Using existing ECR repository")
897
+ else:
898
+ ecr_repo = ecr.Repository(
899
+ self, "ECRRepo", repository_name=full_ecr_repo_name
900
+ ) # Explicitly set repository_name
901
+ print("Created ECR repository", full_ecr_repo_name)
902
+
903
+ ecr_image_loc = ecr_repo.repository_uri
904
+ except Exception as e:
905
+ raise Exception("Could not handle ECR repo due to:", e)
906
+
907
+ # --- CODEBUILD ---
908
+ try:
909
+ codebuild_project_name = CODEBUILD_PROJECT_NAME
910
+ if get_context_bool(f"exists:{codebuild_project_name}"):
911
+ # Lookup CodeBuild project by ARN from context
912
+ project_arn = get_context_str(f"arn:{codebuild_project_name}")
913
+ if not project_arn:
914
+ raise ValueError(
915
+ f"Context value 'arn:{codebuild_project_name}' is required if project exists."
916
+ )
917
+ codebuild_project = codebuild.Project.from_project_arn(
918
+ self, "CodeBuildProject", project_arn=project_arn
919
+ )
920
+ print("Using existing CodeBuild project")
921
+ else:
922
+ codebuild_project = codebuild.Project(
923
+ self,
924
+ "CodeBuildProject", # Logical ID
925
+ project_name=codebuild_project_name, # Explicit resource name
926
+ source=codebuild.Source.git_hub(
927
+ owner=GITHUB_REPO_USERNAME,
928
+ repo=GITHUB_REPO_NAME,
929
+ branch_or_ref=GITHUB_REPO_BRANCH,
930
+ ),
931
+ environment=codebuild.BuildEnvironment(
932
+ build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
933
+ privileged=True,
934
+ environment_variables={
935
+ "ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(
936
+ value=full_ecr_repo_name
937
+ ),
938
+ "AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(
939
+ value=AWS_REGION
940
+ ),
941
+ "AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(
942
+ value=AWS_ACCOUNT_ID
943
+ ),
944
+ "APP_MODE": codebuild.BuildEnvironmentVariable(
945
+ value="gradio"
946
+ ),
947
+ },
948
+ ),
949
+ build_spec=codebuild.BuildSpec.from_object(
950
+ {
951
+ "version": "0.2",
952
+ "phases": {
953
+ "pre_build": {
954
+ "commands": [
955
+ "echo Logging in to Amazon ECR",
956
+ "aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com",
957
+ ]
958
+ },
959
+ "build": {
960
+ "commands": [
961
+ "echo Building the Docker image",
962
+ "docker build --build-args APP_MODE=$APP_MODE --target $APP_MODE -t $ECR_REPO_NAME:latest .",
963
+ "docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
964
+ ]
965
+ },
966
+ "post_build": {
967
+ "commands": [
968
+ "echo Pushing the Docker image",
969
+ "docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest",
970
+ ]
971
+ },
972
+ },
973
+ }
974
+ ),
975
+ )
976
+ print("Successfully created CodeBuild project", codebuild_project_name)
977
+
978
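+ # Grant the CodeBuild project's role pull/push access to the ECR repo. NOTE(review): a project imported with from_project_arn may not expose a role - confirm this grant works in the imported case.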
979
+ ecr_repo.grant_pull_push(codebuild_project.role)
980
+
981
+ except Exception as e:
982
+ raise Exception("Could not handle Codebuild project due to:", e)
983
+
984
+ # --- Security Groups ---
985
+ try:
986
+ ecs_security_group_name = ECS_SECURITY_GROUP_NAME
987
+
988
+ try:
989
+ ecs_security_group = ec2.SecurityGroup(
990
+ self,
991
+ "ECSSecurityGroup", # Logical ID
992
+ security_group_name=ecs_security_group_name, # Explicit resource name
993
+ vpc=vpc,
994
+ )
995
+ print(f"Created Security Group: {ecs_security_group_name}")
996
+ except Exception as e: # If lookup fails, create
997
+ print("Failed to create ECS security group due to:", e)
998
+
999
+ alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
1000
+
1001
+ try:
1002
+ alb_security_group = ec2.SecurityGroup(
1003
+ self,
1004
+ "ALBSecurityGroup", # Logical ID
1005
+ security_group_name=alb_security_group_name, # Explicit resource name
1006
+ vpc=vpc,
1007
+ )
1008
+ print(f"Created Security Group: {alb_security_group_name}")
1009
+ except Exception as e: # If lookup fails, create
1010
+ print("Failed to create ALB security group due to:", e)
1011
+
1012
+ # Define Ingress Rules - CDK will manage adding/removing these as needed
1013
+ ec2_port_gradio_server_port = ec2.Port.tcp(
1014
+ int(GRADIO_SERVER_PORT)
1015
+ ) # Ensure port is int
1016
+ ecs_security_group.add_ingress_rule(
1017
+ peer=alb_security_group,
1018
+ connection=ec2_port_gradio_server_port,
1019
+ description="ALB traffic",
1020
+ )
1021
+
1022
+ alb_security_group.add_ingress_rule(
1023
+ peer=ec2.Peer.prefix_list("pl-93a247fa"),
1024
+ connection=ec2.Port.all_traffic(),
1025
+ description="CloudFront traffic",
1026
+ )
1027
+
1028
+ except Exception as e:
1029
+ raise Exception("Could not handle security groups due to:", e)
1030
+
1031
+ # --- DynamoDB tables for logs (optional) ---
1032
+
1033
+ if SAVE_LOGS_TO_DYNAMODB == "True":
1034
+ try:
1035
+ print("Creating DynamoDB tables for logs")
1036
+
1037
+ dynamodb.Table(
1038
+ self,
1039
+ "RedactionAccessDataTable",
1040
+ table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
1041
+ partition_key=dynamodb.Attribute(
1042
+ name="id", type=dynamodb.AttributeType.STRING
1043
+ ),
1044
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1045
+ removal_policy=RemovalPolicy.DESTROY,
1046
+ )
1047
+
1048
+ dynamodb.Table(
1049
+ self,
1050
+ "RedactionFeedbackDataTable",
1051
+ table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1052
+ partition_key=dynamodb.Attribute(
1053
+ name="id", type=dynamodb.AttributeType.STRING
1054
+ ),
1055
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1056
+ removal_policy=RemovalPolicy.DESTROY,
1057
+ )
1058
+
1059
+ dynamodb.Table(
1060
+ self,
1061
+ "RedactionUsageDataTable",
1062
+ table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
1063
+ partition_key=dynamodb.Attribute(
1064
+ name="id", type=dynamodb.AttributeType.STRING
1065
+ ),
1066
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
1067
+ removal_policy=RemovalPolicy.DESTROY,
1068
+ )
1069
+
1070
+ except Exception as e:
1071
+ raise Exception("Could not create DynamoDB tables due to:", e)
1072
+
1073
+ # --- ALB ---
1074
+ try:
1075
+ load_balancer_name = ALB_NAME
1076
+ if len(load_balancer_name) > 32:
1077
+ load_balancer_name = load_balancer_name[-32:]
1078
+ if get_context_bool(f"exists:{load_balancer_name}"):
1079
+ # Lookup ALB by ARN from context
1080
+ alb_arn = get_context_str(f"arn:{load_balancer_name}")
1081
+ if not alb_arn:
1082
+ raise ValueError(
1083
+ f"Context value 'arn:{load_balancer_name}' is required if ALB exists."
1084
+ )
1085
+ alb = elbv2.ApplicationLoadBalancer.from_lookup(
1086
+ self, "ALB", load_balancer_arn=alb_arn # Logical ID
1087
+ )
1088
+ print(f"Using existing Application Load Balancer {load_balancer_name}.")
1089
+ else:
1090
+ alb = elbv2.ApplicationLoadBalancer(
1091
+ self,
1092
+ "ALB", # Logical ID
1093
+ load_balancer_name=load_balancer_name, # Explicit resource name
1094
+ vpc=vpc,
1095
+ internet_facing=True,
1096
+ security_group=alb_security_group, # Link to SG
1097
+ vpc_subnets=public_subnet_selection, # Link to subnets
1098
+ )
1099
+ print("Successfully created new Application Load Balancer")
1100
+ except Exception as e:
1101
+ raise Exception("Could not handle application load balancer due to:", e)
1102
+
1103
+ # --- Cognito User Pool ---
1104
+ try:
1105
+ if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
1106
+ # Lookup by ID from context
1107
+ user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
1108
+ if not user_pool_id:
1109
+ raise ValueError(
1110
+ f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists."
1111
+ )
1112
+ user_pool = cognito.UserPool.from_user_pool_id(
1113
+ self, "UserPool", user_pool_id=user_pool_id
1114
+ )
1115
+ print(f"Using existing user pool {user_pool_id}.")
1116
+ else:
1117
+ user_pool = cognito.UserPool(
1118
+ self,
1119
+ "UserPool",
1120
+ user_pool_name=COGNITO_USER_POOL_NAME,
1121
+ mfa=cognito.Mfa.OFF, # Adjust as needed
1122
+ sign_in_aliases=cognito.SignInAliases(email=True),
1123
+ removal_policy=RemovalPolicy.DESTROY,
1124
+ ) # Adjust as needed
1125
+ print(f"Created new user pool {user_pool.user_pool_id}.")
1126
+
1127
+ # If you're using a certificate, assume that you will be using the ALB Cognito login features. You need different redirect URLs to accept the token that comes from Cognito authentication.
1128
+ if ACM_SSL_CERTIFICATE_ARN:
1129
+ redirect_uris = [
1130
+ COGNITO_REDIRECTION_URL,
1131
+ COGNITO_REDIRECTION_URL + "/oauth2/idpresponse",
1132
+ ]
1133
+ else:
1134
+ redirect_uris = [COGNITO_REDIRECTION_URL]
1135
+
1136
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
1137
+ if get_context_bool(f"exists:{user_pool_client_name}"):
1138
+ # Lookup by ID from context (requires User Pool object)
1139
+ user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
1140
+ if not user_pool_client_id:
1141
+ raise ValueError(
1142
+ f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists."
1143
+ )
1144
+ user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(
1145
+ self, "UserPoolClient", user_pool_client_id=user_pool_client_id
1146
+ )
1147
+ print(f"Using existing user pool client {user_pool_client_id}.")
1148
+ else:
1149
+ user_pool_client = cognito.UserPoolClient(
1150
+ self,
1151
+ "UserPoolClient",
1152
+ auth_flows=cognito.AuthFlow(
1153
+ user_srp=True, user_password=True
1154
+ ), # Example: enable SRP for secure sign-in
1155
+ user_pool=user_pool,
1156
+ generate_secret=True,
1157
+ user_pool_client_name=user_pool_client_name,
1158
+ supported_identity_providers=[
1159
+ cognito.UserPoolClientIdentityProvider.COGNITO
1160
+ ],
1161
+ o_auth=cognito.OAuthSettings(
1162
+ flows=cognito.OAuthFlows(authorization_code_grant=True),
1163
+ scopes=[
1164
+ cognito.OAuthScope.OPENID,
1165
+ cognito.OAuthScope.EMAIL,
1166
+ cognito.OAuthScope.PROFILE,
1167
+ ],
1168
+ callback_urls=redirect_uris,
1169
+ ),
1170
+ refresh_token_validity=Duration.minutes(
1171
+ COGNITO_REFRESH_TOKEN_VALIDITY
1172
+ ),
1173
+ id_token_validity=Duration.minutes(COGNITO_ID_TOKEN_VALIDITY),
1174
+ access_token_validity=Duration.minutes(
1175
+ COGNITO_ACCESS_TOKEN_VALIDITY
1176
+ ),
1177
+ )
1178
+
1179
+ CfnOutput(
1180
+ self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id
1181
+ )
1182
+
1183
+ print(
1184
+ f"Created new user pool client {user_pool_client.user_pool_client_id}."
1185
+ )
1186
+
1187
+ # Add a domain to the User Pool (crucial for ALB integration)
1188
+ user_pool_domain = user_pool.add_domain(
1189
+ "UserPoolDomain",
1190
+ cognito_domain=cognito.CognitoDomainOptions(
1191
+ domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX
1192
+ ),
1193
+ )
1194
+
1195
+ # Apply removal_policy to the created UserPoolDomain construct
1196
+ user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
1197
+
1198
+ CfnOutput(
1199
+ self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url()
1200
+ )
1201
+
1202
+ except Exception as e:
1203
+ raise Exception("Could not handle Cognito resources due to:", e)
1204
+
1205
+ # --- Secrets Manager Secret ---
1206
+ try:
1207
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
1208
+ if get_context_bool(f"exists:{secret_name}"):
1209
+ # Lookup by name
1210
+ secret = secretsmanager.Secret.from_secret_name_v2(
1211
+ self, "CognitoSecret", secret_name=secret_name
1212
+ )
1213
+ print("Using existing Secret.")
1214
+ else:
1215
+ if USE_CUSTOM_KMS_KEY == "1" and isinstance(kms_key, kms.Key):
1216
+ secret = secretsmanager.Secret(
1217
+ self,
1218
+ "CognitoSecret", # Logical ID
1219
+ secret_name=secret_name, # Explicit resource name
1220
+ secret_object_value={
1221
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
1222
+ user_pool.user_pool_id
1223
+ ), # Use the CDK attribute
1224
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
1225
+ user_pool_client.user_pool_client_id
1226
+ ), # Use the CDK attribute
1227
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
1228
+ },
1229
+ encryption_key=kms_key,
1230
+ )
1231
+ else:
1232
+ secret = secretsmanager.Secret(
1233
+ self,
1234
+ "CognitoSecret", # Logical ID
1235
+ secret_name=secret_name, # Explicit resource name
1236
+ secret_object_value={
1237
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(
1238
+ user_pool.user_pool_id
1239
+ ), # Use the CDK attribute
1240
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(
1241
+ user_pool_client.user_pool_client_id
1242
+ ), # Use the CDK attribute
1243
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret, # Use the CDK attribute
1244
+ },
1245
+ )
1246
+
1247
+ print(
1248
+ "Created new secret in Secrets Manager for Cognito user pool and related details."
1249
+ )
1250
+
1251
+ except Exception as e:
1252
+ raise Exception("Could not handle Secrets Manager secret due to:", e)
1253
+
1254
+ # --- Fargate Task Definition ---
1255
+ try:
1256
+ fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
1257
+
1258
+ read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == "True"
1259
+
1260
+ if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
1261
+ with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
1262
+ task_def_params = json.load(f)
1263
+ # Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
1264
+ else:
1265
+ epheremal_storage_volume_name = "appEphemeralVolume"
1266
+
1267
+ task_def_params = {}
1268
+ task_def_params["taskRoleArn"] = (
1269
+ task_role.role_arn
1270
+ ) # Use CDK role object ARN
1271
+ task_def_params["executionRoleArn"] = (
1272
+ execution_role.role_arn
1273
+ ) # Use CDK role object ARN
1274
+ task_def_params["memory"] = ECS_TASK_MEMORY_SIZE
1275
+ task_def_params["cpu"] = ECS_TASK_CPU_SIZE
1276
+ container_def = {
1277
+ "name": full_ecr_repo_name,
1278
+ "image": ecr_image_loc + ":latest",
1279
+ "essential": True,
1280
+ "portMappings": [
1281
+ {
1282
+ "containerPort": int(GRADIO_SERVER_PORT),
1283
+ "hostPort": int(GRADIO_SERVER_PORT),
1284
+ "protocol": "tcp",
1285
+ "appProtocol": "http",
1286
+ }
1287
+ ],
1288
+ "logConfiguration": {
1289
+ "logDriver": "awslogs",
1290
+ "options": {
1291
+ "awslogs-group": ECS_LOG_GROUP_NAME,
1292
+ "awslogs-region": AWS_REGION,
1293
+ "awslogs-stream-prefix": "ecs",
1294
+ },
1295
+ },
1296
+ "environmentFiles": [
1297
+ {"value": bucket.bucket_arn + "/config.env", "type": "s3"}
1298
+ ],
1299
+ "memoryReservation": int(task_def_params["memory"])
1300
+ - 512, # Reserve some memory for the container
1301
+ "mountPoints": [
1302
+ {
1303
+ "sourceVolume": epheremal_storage_volume_name,
1304
+ "containerPath": "/home/user/app/logs",
1305
+ "readOnly": False,
1306
+ },
1307
+ {
1308
+ "sourceVolume": epheremal_storage_volume_name,
1309
+ "containerPath": "/home/user/app/feedback",
1310
+ "readOnly": False,
1311
+ },
1312
+ {
1313
+ "sourceVolume": epheremal_storage_volume_name,
1314
+ "containerPath": "/home/user/app/usage",
1315
+ "readOnly": False,
1316
+ },
1317
+ {
1318
+ "sourceVolume": epheremal_storage_volume_name,
1319
+ "containerPath": "/home/user/app/input",
1320
+ "readOnly": False,
1321
+ },
1322
+ {
1323
+ "sourceVolume": epheremal_storage_volume_name,
1324
+ "containerPath": "/home/user/app/output",
1325
+ "readOnly": False,
1326
+ },
1327
+ {
1328
+ "sourceVolume": epheremal_storage_volume_name,
1329
+ "containerPath": "/home/user/app/tmp",
1330
+ "readOnly": False,
1331
+ },
1332
+ {
1333
+ "sourceVolume": epheremal_storage_volume_name,
1334
+ "containerPath": "/home/user/app/config",
1335
+ "readOnly": False,
1336
+ },
1337
+ {
1338
+ "sourceVolume": epheremal_storage_volume_name,
1339
+ "containerPath": "/tmp/matplotlib_cache",
1340
+ "readOnly": False,
1341
+ },
1342
+ {
1343
+ "sourceVolume": epheremal_storage_volume_name,
1344
+ "containerPath": "/tmp",
1345
+ "readOnly": False,
1346
+ },
1347
+ {
1348
+ "sourceVolume": epheremal_storage_volume_name,
1349
+ "containerPath": "/var/tmp",
1350
+ "readOnly": False,
1351
+ },
1352
+ {
1353
+ "sourceVolume": epheremal_storage_volume_name,
1354
+ "containerPath": "/tmp/tld",
1355
+ "readOnly": False,
1356
+ },
1357
+ {
1358
+ "sourceVolume": epheremal_storage_volume_name,
1359
+ "containerPath": "/tmp/gradio_tmp",
1360
+ "readOnly": False,
1361
+ },
1362
+ {
1363
+ "sourceVolume": epheremal_storage_volume_name,
1364
+ "containerPath": "/home/user/.paddlex",
1365
+ "readOnly": False,
1366
+ },
1367
+ {
1368
+ "sourceVolume": epheremal_storage_volume_name,
1369
+ "containerPath": "/home/user/.local/share/spacy/data",
1370
+ "readOnly": False,
1371
+ },
1372
+ {
1373
+ "sourceVolume": epheremal_storage_volume_name,
1374
+ "containerPath": "/usr/share/tessdata",
1375
+ "readOnly": False,
1376
+ },
1377
+ ],
1378
+ "readonlyRootFilesystem": read_only_file_system,
1379
+ }
1380
+ task_def_params["containerDefinitions"] = [container_def]
1381
+
1382
+ log_group_name_from_config = task_def_params["containerDefinitions"][0][
1383
+ "logConfiguration"
1384
+ ]["options"]["awslogs-group"]
1385
+
1386
+ cdk_managed_log_group = logs.LogGroup(
1387
+ self,
1388
+ "MyTaskLogGroup", # CDK Logical ID
1389
+ log_group_name=log_group_name_from_config,
1390
+ retention=logs.RetentionDays.ONE_MONTH,
1391
+ removal_policy=RemovalPolicy.DESTROY,
1392
+ )
1393
+
1394
+ epheremal_storage_volume_cdk_obj = ecs.Volume(
1395
+ name=epheremal_storage_volume_name
1396
+ )
1397
+
1398
+ fargate_task_definition = ecs.FargateTaskDefinition(
1399
+ self,
1400
+ "FargateTaskDefinition", # Logical ID
1401
+ family=fargate_task_definition_name,
1402
+ cpu=int(task_def_params["cpu"]),
1403
+ memory_limit_mib=int(task_def_params["memory"]),
1404
+ task_role=task_role,
1405
+ execution_role=execution_role,
1406
+ runtime_platform=ecs.RuntimePlatform(
1407
+ cpu_architecture=ecs.CpuArchitecture.X86_64,
1408
+ operating_system_family=ecs.OperatingSystemFamily.LINUX,
1409
+ ),
1410
+ ephemeral_storage_gib=21, # Minimum is 21 GiB
1411
+ volumes=[epheremal_storage_volume_cdk_obj],
1412
+ )
1413
+ print("Fargate task definition defined.")
1414
+
1415
+ # Add container definitions to the task definition object
1416
+ if task_def_params["containerDefinitions"]:
1417
+ container_def_params = task_def_params["containerDefinitions"][0]
1418
+
1419
+ if container_def_params.get("environmentFiles"):
1420
+ env_files = []
1421
+ for env_file_param in container_def_params["environmentFiles"]:
1422
+ # Need to parse the ARN to get the bucket object and key
1423
+ env_file_arn_parts = env_file_param["value"].split(":::")
1424
+ bucket_name_and_key = env_file_arn_parts[-1]
1425
+ env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
1426
+
1427
+ env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
1428
+
1429
+ env_files.append(env_file)
1430
+
1431
+ container = fargate_task_definition.add_container(
1432
+ container_def_params["name"],
1433
+ image=ecs.ContainerImage.from_registry(
1434
+ container_def_params["image"]
1435
+ ),
1436
+ logging=ecs.LogDriver.aws_logs(
1437
+ stream_prefix=container_def_params["logConfiguration"][
1438
+ "options"
1439
+ ]["awslogs-stream-prefix"],
1440
+ log_group=cdk_managed_log_group,
1441
+ ),
1442
+ secrets={
1443
+ "AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(
1444
+ secret, "REDACTION_USER_POOL_ID"
1445
+ ),
1446
+ "AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(
1447
+ secret, "REDACTION_CLIENT_ID"
1448
+ ),
1449
+ "AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(
1450
+ secret, "REDACTION_CLIENT_SECRET"
1451
+ ),
1452
+ },
1453
+ environment_files=env_files,
1454
+ readonly_root_filesystem=read_only_file_system,
1455
+ )
1456
+
1457
+ for port_mapping in container_def_params["portMappings"]:
1458
+ container.add_port_mappings(
1459
+ ecs.PortMapping(
1460
+ container_port=int(port_mapping["containerPort"]),
1461
+ host_port=int(port_mapping["hostPort"]),
1462
+ name="port-" + str(port_mapping["containerPort"]),
1463
+ app_protocol=ecs.AppProtocol.http,
1464
+ protocol=ecs.Protocol.TCP,
1465
+ )
1466
+ )
1467
+
1468
+ container.add_port_mappings(
1469
+ ecs.PortMapping(
1470
+ container_port=80,
1471
+ host_port=80,
1472
+ name="port-80",
1473
+ app_protocol=ecs.AppProtocol.http,
1474
+ protocol=ecs.Protocol.TCP,
1475
+ )
1476
+ )
1477
+
1478
+ if container_def_params.get("mountPoints"):
1479
+ mount_points = []
1480
+ for mount_point in container_def_params["mountPoints"]:
1481
+ mount_points.append(
1482
+ ecs.MountPoint(
1483
+ container_path=mount_point["containerPath"],
1484
+ read_only=mount_point["readOnly"],
1485
+ source_volume=epheremal_storage_volume_name,
1486
+ )
1487
+ )
1488
+ container.add_mount_points(*mount_points)
1489
+
1490
+ except Exception as e:
1491
+ raise Exception("Could not handle Fargate task definition due to:", e)
1492
+
1493
+ # --- ECS Cluster ---
1494
+ try:
1495
+ cluster = ecs.Cluster(
1496
+ self,
1497
+ "ECSCluster", # Logical ID
1498
+ cluster_name=CLUSTER_NAME, # Explicit resource name
1499
+ enable_fargate_capacity_providers=True,
1500
+ vpc=vpc,
1501
+ )
1502
+ print("Successfully created new ECS cluster")
1503
+ except Exception as e:
1504
+ raise Exception("Could not handle ECS cluster due to:", e)
1505
+
1506
+ # --- ECS Service ---
1507
+ try:
1508
+ ecs_service_name = ECS_SERVICE_NAME
1509
+
1510
+ if ECS_USE_FARGATE_SPOT == "True":
1511
+ use_fargate_spot = "FARGATE_SPOT"
1512
+ if ECS_USE_FARGATE_SPOT == "False":
1513
+ use_fargate_spot = "FARGATE"
1514
+
1515
+ # Check if service exists - from_service_arn or from_service_name (needs cluster)
1516
+ try:
1517
+ # from_service_attributes builds the service reference from the cluster object and the service name
1518
+ ecs_service = ecs.FargateService.from_service_attributes(
1519
+ self,
1520
+ "ECSService", # Logical ID
1521
+ cluster=cluster, # Requires the cluster object
1522
+ service_name=ecs_service_name,
1523
+ )
1524
+ print(f"Using existing ECS service {ecs_service_name}.")
1525
+ except Exception:
1526
+ # Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
1527
+ ecs_service = ecs.FargateService(
1528
+ self,
1529
+ "ECSService", # Logical ID
1530
+ service_name=ecs_service_name, # Explicit resource name
1531
+ platform_version=ecs.FargatePlatformVersion.LATEST,
1532
+ capacity_provider_strategies=[
1533
+ ecs.CapacityProviderStrategy(
1534
+ capacity_provider=use_fargate_spot, base=0, weight=1
1535
+ )
1536
+ ],
1537
+ cluster=cluster,
1538
+ task_definition=fargate_task_definition, # Link to TD
1539
+ security_groups=[ecs_security_group], # Link to SG
1540
+ vpc_subnets=ec2.SubnetSelection(
1541
+ subnets=self.private_subnets
1542
+ ), # Link to subnets
1543
+ min_healthy_percent=0,
1544
+ max_healthy_percent=100,
1545
+ desired_count=0,
1546
+ )
1547
+ print("Successfully created new ECS service")
1548
+
1549
+ # Note: Auto-scaling setup would typically go here if needed for the service
1550
+
1551
+ except Exception as e:
1552
+ raise Exception("Could not handle ECS service due to:", e)
1553
+
1554
+ # --- Grant Secret Read Access (Applies to both created and imported roles) ---
1555
+ try:
1556
+ secret.grant_read(task_role)
1557
+ secret.grant_read(execution_role)
1558
+ except Exception as e:
1559
+ raise Exception("Could not grant access to Secrets Manager due to:", e)
1560
+
1561
+ # --- ALB TARGET GROUPS AND LISTENERS ---
1562
+ # This section should primarily define the resources if they are managed by this stack.
1563
+ # CDK handles adding/removing targets and actions on updates.
1564
+ # If they might pre-exist outside the stack, you need lookups.
1565
+ cookie_duration = Duration.hours(12)
1566
+ target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
1567
+ cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
1568
+
1569
+ try:
1570
+ # --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
1571
+
1572
+ target_group = elbv2.ApplicationTargetGroup(
1573
+ self,
1574
+ "AppTargetGroup", # Logical ID
1575
+ target_group_name=target_group_name, # Explicit resource name
1576
+ port=int(GRADIO_SERVER_PORT), # Ensure port is int
1577
+ protocol=elbv2.ApplicationProtocol.HTTP,
1578
+ targets=[ecs_service], # Link to ECS Service
1579
+ stickiness_cookie_duration=cookie_duration,
1580
+ vpc=vpc, # Target Groups need VPC
1581
+ )
1582
+ print(f"ALB target group {target_group_name} defined.")
1583
+
1584
+ # First HTTP
1585
+ listener_port = 80
1586
+ # No existence check is done here - the HTTP listener is always added to the ALB
1587
+
1588
+ http_listener = alb.add_listener(
1589
+ "HttpListener", # Logical ID
1590
+ port=listener_port,
1591
+ open=False, # Be cautious with open=True, usually restrict source SG
1592
+ )
1593
+ print(f"ALB listener on port {listener_port} defined.")
1594
+
1595
+ if ACM_SSL_CERTIFICATE_ARN:
1596
+ http_listener.add_action(
1597
+ "DefaultAction", # Logical ID for the default action
1598
+ action=elbv2.ListenerAction.redirect(
1599
+ protocol="HTTPS",
1600
+ host="#{host}",
1601
+ port="443",
1602
+ path="/#{path}",
1603
+ query="#{query}",
1604
+ ),
1605
+ )
1606
+ else:
1607
+ if USE_CLOUDFRONT == "True":
1608
+
1609
+ # The following default action can be added for the listener after a host header rule is added to the listener manually in the Console as suggested in the above comments.
1610
+ http_listener.add_action(
1611
+ "DefaultAction", # Logical ID for the default action
1612
+ action=elbv2.ListenerAction.fixed_response(
1613
+ status_code=403,
1614
+ content_type="text/plain",
1615
+ message_body="Access denied",
1616
+ ),
1617
+ )
1618
+
1619
+ # Add the Listener Rule for the specific CloudFront Host Header
1620
+ http_listener.add_action(
1621
+ "CloudFrontHostHeaderRule",
1622
+ action=elbv2.ListenerAction.forward(
1623
+ target_groups=[target_group],
1624
+ stickiness_duration=cookie_duration,
1625
+ ),
1626
+ priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1627
+ conditions=[
1628
+ elbv2.ListenerCondition.host_headers(
1629
+ [cloudfront_distribution_url]
1630
+ ) # May have to redefine url in console afterwards if not specified in config file
1631
+ ],
1632
+ )
1633
+
1634
+ else:
1635
+ # CloudFront not in use: forward HTTP traffic to the target group with no priority or host-header condition
1636
+ http_listener.add_action(
1637
+ "CloudFrontHostHeaderRule",
1638
+ action=elbv2.ListenerAction.forward(
1639
+ target_groups=[target_group],
1640
+ stickiness_duration=cookie_duration,
1641
+ ),
1642
+ )
1643
+
1644
+ print("Added targets and actions to ALB HTTP listener.")
1645
+
1646
+ # Now the same for HTTPS if you have an ACM certificate
1647
+ if ACM_SSL_CERTIFICATE_ARN:
1648
+ listener_port_https = 443
1649
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1650
+
1651
+ https_listener = add_alb_https_listener_with_cert(
1652
+ self,
1653
+ "MyHttpsListener", # Logical ID for the HTTPS listener
1654
+ alb,
1655
+ acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
1656
+ default_target_group=target_group,
1657
+ enable_cognito_auth=True,
1658
+ cognito_user_pool=user_pool,
1659
+ cognito_user_pool_client=user_pool_client,
1660
+ cognito_user_pool_domain=user_pool_domain,
1661
+ listener_open_to_internet=True,
1662
+ stickiness_cookie_duration=cookie_duration,
1663
+ )
1664
+
1665
+ if https_listener:
1666
+ CfnOutput(
1667
+ self, "HttpsListenerArn", value=https_listener.listener_arn
1668
+ )
1669
+
1670
+ print(f"ALB listener on port {listener_port_https} defined.")
1671
+
1672
+ # if USE_CLOUDFRONT == 'True':
1673
+ # # Add default action to the listener
1674
+ # https_listener.add_action(
1675
+ # "DefaultAction", # Logical ID for the default action
1676
+ # action=elbv2.ListenerAction.fixed_response(
1677
+ # status_code=403,
1678
+ # content_type="text/plain",
1679
+ # message_body="Access denied",
1680
+ # ),
1681
+ # )
1682
+
1683
+ # # Add the Listener Rule for the specific CloudFront Host Header
1684
+ # https_listener.add_action(
1685
+ # "CloudFrontHostHeaderRuleHTTPS",
1686
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1687
+ # priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1688
+ # conditions=[
1689
+ # elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
1690
+ # ]
1691
+ # )
1692
+ # else:
1693
+ # https_listener.add_action(
1694
+ # "CloudFrontHostHeaderRuleHTTPS",
1695
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
1696
+
1697
+ print("Added targets and actions to ALB HTTPS listener.")
1698
+
1699
+ except Exception as e:
1700
+ raise Exception(
1701
+ "Could not handle ALB target groups and listeners due to:", e
1702
+ )
1703
+
1704
+ # Create WAF to attach to load balancer
1705
+ try:
1706
+ web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
1707
+ if get_context_bool(f"exists:{web_acl_name}"):
1708
+ # Lookup WAF ACL by ARN from context
1709
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1710
+ if not web_acl_arn:
1711
+ raise ValueError(
1712
+ f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
1713
+ )
1714
+
1715
+ web_acl = create_web_acl_with_common_rules(
1716
+ self, web_acl_name, waf_scope="REGIONAL"
1717
+ ) # Assuming it takes scope and name
1718
+ print(f"Handled ALB WAF web ACL {web_acl_name}.")
1719
+ else:
1720
+ web_acl = create_web_acl_with_common_rules(
1721
+ self, web_acl_name, waf_scope="REGIONAL"
1722
+ ) # Assuming it takes scope and name
1723
+ print(f"Created ALB WAF web ACL {web_acl_name}.")
1724
+
1725
+ wafv2.CfnWebACLAssociation(
1726
+ self,
1727
+ id="alb_waf_association",
1728
+ resource_arn=alb.load_balancer_arn,
1729
+ web_acl_arn=web_acl.attr_arn,
1730
+ )
1731
+
1732
+ except Exception as e:
1733
+ raise Exception("Could not handle create ALB WAF web ACL due to:", e)
1734
+
1735
+ # --- Outputs for other stacks/regions ---
1736
+
1737
+ self.params = dict()
1738
+ self.params["alb_arn_output"] = alb.load_balancer_arn
1739
+ self.params["alb_security_group_id"] = alb_security_group.security_group_id
1740
+ self.params["alb_dns_name"] = alb.load_balancer_dns_name
1741
+
1742
+ CfnOutput(
1743
+ self,
1744
+ "AlbArnOutput",
1745
+ value=alb.load_balancer_arn,
1746
+ description="ARN of the Application Load Balancer",
1747
+ export_name=f"{self.stack_name}-AlbArn",
1748
+ ) # Export name must be unique within the account/region
1749
+
1750
+ CfnOutput(
1751
+ self,
1752
+ "AlbSecurityGroupIdOutput",
1753
+ value=alb_security_group.security_group_id,
1754
+ description="ID of the ALB's Security Group",
1755
+ export_name=f"{self.stack_name}-AlbSgId",
1756
+ )
1757
+ CfnOutput(self, "ALBName", value=alb.load_balancer_name)
1758
+
1759
+ CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
1760
+
1761
+ CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
1762
+ # Add other outputs if needed
1763
+
1764
+ CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
1765
+
1766
+
1767
+ # --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
1768
class CdkStackCloudfront(Stack):
    """Stack defining a CloudFront distribution, and its WAF web ACL, in front of an ALB.

    The ALB itself is created elsewhere; this stack only imports it by ARN,
    security group ID and DNS name and uses it as the distribution's origin.
    """

    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        alb_arn: str,
        alb_sec_group_id: str,
        alb_dns_name: str,
        **kwargs,
    ) -> None:
        """Define the web ACL, the CloudFront distribution and the URL output.

        Args:
            scope: Parent construct.
            construct_id: Logical ID of this stack.
            alb_arn: ARN of the existing Application Load Balancer.
            alb_sec_group_id: ID of the ALB's security group.
            alb_dns_name: DNS name of the ALB.
            **kwargs: Forwarded unchanged to ``Stack.__init__``.

        Raises:
            ValueError: If ``alb_arn`` or ``alb_sec_group_id`` is empty.
            Exception: If defining the web ACL or the distribution fails; the
                original error is attached as the cause.
        """
        super().__init__(scope, construct_id, **kwargs)

        # --- Helpers to get context values ---
        def get_context_bool(key: str, default: bool = False) -> bool:
            # Coerce so the declared bool return type actually holds.
            return bool(self.node.try_get_context(key) or default)

        def get_context_str(key: str, default: str = None) -> str:
            return self.node.try_get_context(key) or default

        print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
        print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")

        if not alb_arn:
            raise ValueError("ALB ARN must be provided to CloudFront stack")
        if not alb_sec_group_id:
            raise ValueError(
                "ALB Security Group ID must be provided to CloudFront stack"
            )

        # Import the existing ALB so it can be referenced as an origin below.
        alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
            self,
            "ImportedAlb",
            load_balancer_arn=alb_arn,
            security_group_id=alb_sec_group_id,
            load_balancer_dns_name=alb_dns_name,
        )

        try:
            web_acl_name = WEB_ACL_NAME
            web_acl_already_exists = get_context_bool(f"exists:{web_acl_name}")

            if web_acl_already_exists:
                # The ARN is only validated here; the ACL is still defined by
                # this stack in both cases, exactly as before.
                # NOTE(review): confirm that re-defining rather than importing
                # an existing ACL is intended.
                web_acl_arn = get_context_str(f"arn:{web_acl_name}")
                if not web_acl_arn:
                    raise ValueError(
                        f"Context value 'arn:{web_acl_name}' is required if Web ACL exists."
                    )

            web_acl = create_web_acl_with_common_rules(self, web_acl_name)

            if web_acl_already_exists:
                print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
            else:
                print(f"Created Cloudfront WAF web ACL {web_acl_name}.")

            # Add ALB as CloudFront origin; the custom header is attached to
            # every origin request.
            origin = origins.LoadBalancerV2Origin(
                alb,
                custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
                origin_shield_enabled=False,
                protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
            )

            # Restrict viewers to the configured locations only when configured.
            if CLOUDFRONT_GEO_RESTRICTION:
                geo_restrict = cloudfront.GeoRestriction.allowlist(
                    CLOUDFRONT_GEO_RESTRICTION
                )
            else:
                geo_restrict = None

            cloudfront_distribution = cloudfront.Distribution(
                self,
                "CloudFrontDistribution",  # Logical ID
                comment=CLOUDFRONT_DISTRIBUTION_NAME,  # Name as comment for easier identification
                geo_restriction=geo_restrict,
                default_behavior=cloudfront.BehaviorOptions(
                    origin=origin,
                    viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
                    allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
                    cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
                    origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
                ),
                web_acl_id=web_acl.attr_arn,
            )
            print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")

        except Exception as e:
            # Chain explicitly so the root cause is reported as the cause.
            raise Exception("Could not handle Cloudfront distribution due to:", e) from e

        # --- Outputs ---
        CfnOutput(
            self, "CloudFrontDistributionURL", value=cloudfront_distribution.domain_name
        )
cdk/check_resources.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List
4
+
5
+ from cdk_config import ( # Import necessary config
6
+ ALB_NAME,
7
+ AWS_REGION,
8
+ CDK_CONFIG_PATH,
9
+ CDK_FOLDER,
10
+ CODEBUILD_PROJECT_NAME,
11
+ CODEBUILD_ROLE_NAME,
12
+ COGNITO_USER_POOL_CLIENT_NAME,
13
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME,
14
+ COGNITO_USER_POOL_NAME,
15
+ CONTEXT_FILE,
16
+ ECR_CDK_REPO_NAME,
17
+ ECS_TASK_EXECUTION_ROLE_NAME,
18
+ ECS_TASK_ROLE_NAME,
19
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
20
+ PRIVATE_SUBNET_CIDR_BLOCKS,
21
+ PRIVATE_SUBNETS_TO_USE,
22
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
23
+ PUBLIC_SUBNET_CIDR_BLOCKS,
24
+ PUBLIC_SUBNETS_TO_USE,
25
+ S3_LOG_CONFIG_BUCKET_NAME,
26
+ S3_OUTPUT_BUCKET_NAME,
27
+ VPC_NAME,
28
+ WEB_ACL_NAME,
29
+ )
30
+ from cdk_functions import ( # Import your check functions (assuming they use Boto3)
31
+ _get_existing_subnets_in_vpc,
32
+ check_alb_exists,
33
+ check_codebuild_project_exists,
34
+ check_ecr_repo_exists,
35
+ check_for_existing_role,
36
+ check_for_existing_user_pool,
37
+ check_for_existing_user_pool_client,
38
+ check_for_secret,
39
+ check_s3_bucket_exists,
40
+ check_subnet_exists_by_name,
41
+ check_web_acl_exists,
42
+ get_vpc_id_by_name,
43
+ validate_subnet_creation_parameters,
44
+ # Add other check functions as needed
45
+ )
46
+
47
# CDK_FOLDER is expected to hold the full path of the CDK folder.
cdk_folder = CDK_FOLDER

# The config file is located via this environment variable, so publish the
# folder path joined directly onto the configured file path.
_cdk_config_full_path = cdk_folder + CDK_CONFIG_PATH
os.environ["CDK_CONFIG_PATH"] = _cdk_config_full_path
51
+
52
+
53
+ # --- Helper to parse environment variables into lists ---
54
def _get_env_list(env_var_name: str) -> List[str]:
    """Parse a list-like configuration string into a list of strings.

    Despite its name, ``env_var_name`` is the *value* to parse, not the name
    of an environment variable. Accepted forms include ``'["a", "b"]'``,
    ``"['a','b']"``, ``"(a, b)"`` and a bare ``"a,b"``.

    Args:
        env_var_name: The raw comma-separated text, optionally wrapped in one
            pair of square brackets or parentheses.

    Returns:
        The items with quotes and surrounding whitespace removed. Empty items
        (for example from a trailing comma) are dropped, so an empty or
        bracket-only value gives ``[]``.
    """
    value = env_var_name.strip()

    # Remove the enclosing pair only when it is really there. Slicing
    # unconditionally would eat the first and last characters of bare values.
    if len(value) >= 2 and (value[0], value[-1]) in {("[", "]"), ("(", ")")}:
        value = value[1:-1]

    value = value.replace('"', "").replace("'", "")

    # Split by comma and drop empty strings left by extra commas.
    return [item.strip() for item in value.split(",") if item.strip()]
61
+
62
+
63
def _ensure_list(setting):
    """Return ``setting`` unchanged if it is falsy or already a list, else parse it."""
    if not setting or isinstance(setting, list):
        return setting
    return _get_env_list(setting)


# Subnet settings may arrive as list-like strings; normalise each to a list.
PUBLIC_SUBNETS_TO_USE = _ensure_list(PUBLIC_SUBNETS_TO_USE)
PRIVATE_SUBNETS_TO_USE = _ensure_list(PRIVATE_SUBNETS_TO_USE)
PUBLIC_SUBNET_CIDR_BLOCKS = _ensure_list(PUBLIC_SUBNET_CIDR_BLOCKS)
PUBLIC_SUBNET_AVAILABILITY_ZONES = _ensure_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
PRIVATE_SUBNET_CIDR_BLOCKS = _ensure_list(PRIVATE_SUBNET_CIDR_BLOCKS)
PRIVATE_SUBNET_AVAILABILITY_ZONES = _ensure_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
79
+
80
+ # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
81
+
82
+
83
def _build_subnet_specs(names, cidr_blocks, availability_zones, **extra_fields):
    """Combine parallel name/CIDR/AZ lists into one dict per subnet."""
    return [
        {"name": name, "cidr": cidr, "az": az, **extra_fields}
        for name, cidr, az in zip(names, cidr_blocks, availability_zones)
    ]


def _check_subnets_by_name(subnet_names, existing_aws_subnets) -> Dict[str, Dict[str, Any]]:
    """Report, for each requested subnet name, whether it exists and its ID."""
    checked = {}
    for subnet_name in subnet_names:
        print("subnet_name:", subnet_name)
        exists, subnet_id = check_subnet_exists_by_name(
            subnet_name, existing_aws_subnets
        )
        checked[subnet_name] = {"exists": exists, "id": subnet_id}
    return checked


def _collect_vpc_context(context_data: Dict[str, Any]) -> None:
    """Add VPC, NAT gateway and subnet findings for VPC_NAME to ``context_data``.

    Raises:
        RuntimeError: If the VPC named VPC_NAME cannot be found.
        SystemExit: If existing subnets cannot be fetched or validation fails.
    """
    print("VPC_NAME:", VPC_NAME)
    vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)

    # Only checks whether *any* NAT gateway was returned for the VPC and
    # records the first one; it does not match by subnet, AZ or tag.
    if nat_gateways:
        context_data["exists:NatGateway"] = True
        context_data["id:NatGateway"] = nat_gateways[0]["NatGatewayId"]
    else:
        context_data["exists:NatGateway"] = False
        context_data["id:NatGateway"] = None

    if not vpc_id:
        raise RuntimeError(
            f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks."
        )

    context_data["vpc_id"] = vpc_id

    # SUBNET CHECKS
    # Treat unset settings as empty lists so the length checks cannot fail.
    public_names = PUBLIC_SUBNETS_TO_USE or []
    public_cidrs = PUBLIC_SUBNET_CIDR_BLOCKS or []
    public_azs = PUBLIC_SUBNET_AVAILABILITY_ZONES or []
    private_names = PRIVATE_SUBNETS_TO_USE or []
    private_cidrs = PRIVATE_SUBNET_CIDR_BLOCKS or []
    private_azs = PRIVATE_SUBNET_AVAILABILITY_ZONES or []

    # A subnet type is ready for full validation when names are given and the
    # CIDR and AZ lists have the same length as the name list.
    public_ready_for_full_validation = (
        len(public_names) > 0
        and len(public_cidrs) == len(public_names)
        and len(public_azs) == len(public_names)
    )
    private_ready_for_full_validation = (
        len(private_names) > 0
        and len(private_cidrs) == len(private_names)
        and len(private_azs) == len(private_names)
    )

    # Full validation runs if *either* subnet type has its full details.
    full_validation_mode = (
        public_ready_for_full_validation or private_ready_for_full_validation
    )

    if (
        public_ready_for_full_validation
        and not private_ready_for_full_validation
        and PRIVATE_SUBNETS_TO_USE
    ):
        print(
            "Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs."
        )
    if (
        private_ready_for_full_validation
        and not public_ready_for_full_validation
        and PUBLIC_SUBNETS_TO_USE
    ):
        print(
            "Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs."
        )

    # Subnets with full details, to be passed to validation below.
    all_proposed_subnets_data: List[Dict[str, str]] = []
    if public_ready_for_full_validation:
        all_proposed_subnets_data += _build_subnet_specs(
            public_names, public_cidrs, public_azs
        )
    if private_ready_for_full_validation:
        all_proposed_subnets_data += _build_subnet_specs(
            private_names, private_cidrs, private_azs
        )

    print(f"Target VPC ID for Boto3 lookup: {vpc_id}")

    # Fetch the VPC's existing subnets once to avoid repeated API calls.
    try:
        existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
    except Exception as e:
        print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
        raise SystemExit(1) from e  # Cannot continue without baseline data

    print("\n--- Running Name-Only Subnet Existence Check Mode ---")
    checked_public_subnets = _check_subnets_by_name(public_names, existing_aws_subnets)
    context_data["checked_public_subnets"] = checked_public_subnets

    checked_private_subnets = _check_subnets_by_name(
        private_names, existing_aws_subnets
    )
    context_data["checked_private_subnets"] = checked_private_subnets

    # Subnets that already exist are dropped from the proposed list.
    already_present = {
        name
        for checked in (checked_public_subnets, checked_private_subnets)
        for name, info in checked.items()
        if info["exists"] is True
    }
    all_proposed_subnets_data = [
        subnet
        for subnet in all_proposed_subnets_data
        if subnet["name"] not in already_present
    ]

    print("\nName-only existence subnet check complete.\n")

    if not full_validation_mode:
        return

    print("\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---")
    try:
        validate_subnet_creation_parameters(
            vpc_id, all_proposed_subnets_data, existing_aws_subnets
        )
    except Exception as e:
        print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
        raise SystemExit(1) from e  # Exit if validation fails

    print("\nPre-synth validation successful. Proceeding with CDK synth.\n")

    # These lists keep every configured subnet of a ready type, including ones
    # found to exist above.
    context_data["public_subnets_to_create"] = (
        _build_subnet_specs(public_names, public_cidrs, public_azs, is_public=True)
        if public_ready_for_full_validation
        else []
    )
    context_data["private_subnets_to_create"] = (
        _build_subnet_specs(private_names, private_cidrs, private_azs, is_public=False)
        if private_ready_for_full_validation
        else []
    )


def check_and_set_context():
    """Check which configured AWS resources already exist and save the findings.

    Each check writes ``exists:<name>`` (plus ``arn:<name>`` or ``id:<name>``
    where available) into a dict that is dumped as JSON to CONTEXT_FILE. When
    VPC_NAME is set, VPC, NAT gateway and subnet findings are included too.

    The VPC and NAT gateway entries are now kept in the written context;
    previously the dict was re-created before the subnet checks, which threw
    them away. Each existence check also calls AWS once instead of twice.

    Raises:
        RuntimeError: If VPC_NAME is set but the VPC cannot be found.
        SystemExit: If subnet lookup or subnet validation fails.
    """
    context_data: Dict[str, Any] = {}

    # --- VPC, NAT gateway and subnets ---
    # The subnet checks need the VPC ID, so they only run when a VPC is named.
    if VPC_NAME:
        _collect_vpc_context(context_data)

    # --- IAM roles ---
    for role_name in (
        CODEBUILD_ROLE_NAME,
        ECS_TASK_ROLE_NAME,
        ECS_TASK_EXECUTION_ROLE_NAME,
    ):
        exists, role_arn, _ = check_for_existing_role(role_name)
        context_data[f"exists:{role_name}"] = exists
        if exists:
            context_data[f"arn:{role_name}"] = role_arn

    # --- S3 buckets (only existence is recorded) ---
    for bucket_name in (S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME):
        exists, _ = check_s3_bucket_exists(bucket_name)
        context_data[f"exists:{bucket_name}"] = exists

    # --- ECR repository (only existence is recorded) ---
    repo_name = ECR_CDK_REPO_NAME
    exists, _ = check_ecr_repo_exists(repo_name)
    context_data[f"exists:{repo_name}"] = exists

    # --- CodeBuild project ---
    project_name = CODEBUILD_PROJECT_NAME
    exists, project_arn = check_codebuild_project_exists(project_name)
    context_data[f"exists:{project_name}"] = exists
    if exists:
        context_data[f"arn:{project_name}"] = project_arn

    # --- ALB (by name lookup) ---
    alb_name = ALB_NAME
    exists, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION)
    context_data[f"exists:{alb_name}"] = exists
    if exists:
        print("alb_object:", alb_object)
        context_data[f"arn:{alb_name}"] = alb_object["LoadBalancerArn"]

    # --- Cognito user pool (by name) ---
    user_pool_name = COGNITO_USER_POOL_NAME
    exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
    context_data[f"exists:{user_pool_name}"] = exists
    if exists:
        context_data[f"id:{user_pool_name}"] = user_pool_id

    # --- Cognito user pool client (needs the pool ID found above) ---
    if user_pool_id:
        user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
        exists, client_id, _ = check_for_existing_user_pool_client(
            user_pool_client_name, user_pool_id
        )
        context_data[f"exists:{user_pool_client_name}"] = exists
        if exists:
            context_data[f"id:{user_pool_client_name}"] = client_id

    # --- Secrets Manager secret (by name; only existence is recorded) ---
    secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
    exists, _ = check_for_secret(secret_name)
    context_data[f"exists:{secret_name}"] = exists

    # --- WAF web ACL (by name, CLOUDFRONT scope) ---
    web_acl_name = WEB_ACL_NAME
    exists, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
    context_data[f"exists:{web_acl_name}"] = exists
    if exists:
        # NOTE(review): attribute access assumes the helper returns an object
        # exposing `attr_arn`; if it returns a boto3 dict this raises
        # AttributeError. Confirm against cdk_functions.
        context_data[f"arn:{web_acl_name}"] = existing_web_acl.attr_arn

    # Write the context data to the file
    with open(CONTEXT_FILE, "w") as f:
        json.dump(context_data, f, indent=2)

    print(f"Context data written to {CONTEXT_FILE}")
cdk/lambda_load_dynamo_logs.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lambda handler to export DynamoDB usage log table to CSV and upload to S3.
3
+
4
+ All inputs are read from environment variables (no argparse).
5
+ Intended to run as an AWS Lambda function; can also be invoked locally
6
+ by setting env vars and calling lambda_handler({}, None).
7
+
8
+ Environment variables (same semantics as load_dynamo_logs.py CLI):
9
+ DYNAMODB_TABLE_NAME - DynamoDB table name (default: redaction_usage)
10
+ AWS_REGION - AWS region (optional; if unset, uses AWS_DEFAULT_REGION,
11
+ then region from Lambda context ARN, then eu-west-2)
12
+ OUTPUT_FOLDER - Local output directory, e.g. /tmp (optional)
13
+ OUTPUT_FILENAME - Local output file name (default: dynamodb_logs_export.csv)
14
+ OUTPUT - Full local output path (overrides folder + filename if set).
15
+ In Lambda only /tmp is writable; relative paths are auto-resolved to /tmp.
16
+ FROM_DATE - Only include entries on/after this date YYYY-MM-DD (optional)
17
+ TO_DATE - Only include entries on/before this date YYYY-MM-DD (optional)
18
+ DATE_ATTRIBUTE - Attribute name for date filtering (default: timestamp)
19
+ S3_OUTPUT_BUCKET - S3 bucket for the output CSV (required for upload)
20
+ S3_OUTPUT_KEY - S3 object key/path for the output CSV (required for upload)
21
+ """
22
+
23
+ import csv
24
+ import datetime
25
+ import os
26
+ from decimal import Decimal
27
+ from io import StringIO
28
+
29
+ import boto3
30
+
31
+
32
def _get_region_from_context(context):
    """Return the region segment of the Lambda context's function ARN, or None.

    The ARN has the form ``arn:aws:lambda:REGION:ACCOUNT:function:NAME``, so the
    region is the fourth colon-separated segment. ``None`` is returned when
    there is no context, no usable string ARN, or too few segments.
    """
    function_arn = (
        None if context is None else getattr(context, "invoked_function_arn", None)
    )
    if not isinstance(function_arn, str) or not function_arn:
        return None

    segments = function_arn.split(":")
    return segments[3] if len(segments) >= 4 else None
43
+
44
+
45
def get_config_from_env(context=None, default_region="eu-west-2"):
    """Build the export configuration from environment variables.

    Args:
        context: Optional Lambda context. Its function ARN supplies the region
            when neither AWS_REGION nor AWS_DEFAULT_REGION is set.
        default_region: Region used when the environment and the context give
            none. The default matches the fallback documented for this module;
            previously a non-region placeholder string was used here.

    Returns:
        A dict with keys ``table_name``, ``region``, ``local_output_path``,
        ``from_date``, ``to_date``, ``date_attribute``, ``s3_output_bucket``
        and ``s3_output_key``. A missing date bound defaults to one year ago
        (from) or today (to).

    Raises:
        ValueError: If FROM_DATE or TO_DATE is set but not ``YYYY-MM-DD``.
    """
    env = os.environ
    today = datetime.datetime.now().date()
    one_year_ago = today - datetime.timedelta(days=365)

    table_name = env.get("DYNAMODB_TABLE_NAME") or env.get(
        "USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage"
    )
    region = (env.get("AWS_REGION") or env.get("AWS_DEFAULT_REGION") or "").strip()
    output = env.get("OUTPUT")
    output_folder = env.get("OUTPUT_FOLDER", "output/")
    output_filename = env.get("OUTPUT_FILENAME", "dynamodb_logs_export.csv")
    from_date_str = env.get("FROM_DATE")
    to_date_str = env.get("TO_DATE")

    # OUTPUT, when set, overrides folder + filename.
    if output:
        local_output_path = output
    else:
        folder = output_folder.rstrip("/").rstrip("\\")
        local_output_path = os.path.join(folder, output_filename)

    # In AWS Lambda only /tmp is writable, so redirect anything outside it.
    # Compare against the directory itself: a bare prefix test would wrongly
    # accept siblings such as /tmpfoo.
    if env.get("AWS_LAMBDA_FUNCTION_NAME"):
        resolved = os.path.abspath(local_output_path)
        if resolved != "/tmp" and not resolved.startswith("/tmp/"):
            local_output_path = os.path.join(
                "/tmp", os.path.basename(local_output_path)
            )

    # Region: environment, then Lambda context ARN, then default_region.
    if not region and context is not None:
        region = _get_region_from_context(context) or ""
    if not region:
        region = default_region

    from_date = (
        datetime.datetime.strptime(from_date_str, "%Y-%m-%d").date()
        if from_date_str
        else one_year_ago
    )
    to_date = (
        datetime.datetime.strptime(to_date_str, "%Y-%m-%d").date()
        if to_date_str
        else today
    )

    return {
        "table_name": table_name,
        "region": region,
        "local_output_path": local_output_path,
        "from_date": from_date,
        "to_date": to_date,
        "date_attribute": env.get("DATE_ATTRIBUTE", "timestamp"),
        "s3_output_bucket": env.get("S3_OUTPUT_BUCKET"),
        "s3_output_key": env.get("S3_OUTPUT_KEY"),
    }
111
+
112
+
113
+ # Helper function to convert Decimal to float or int
114
def convert_types(item):
    """Return a copy of ``item`` with values normalised for CSV output.

    Decimal values become int when whole, otherwise float. Strings that parse
    as ISO date-times (a trailing ``Z`` is read as ``+00:00``) are rewritten as
    ``YYYY-MM-DD HH:MM:SS.mmm``. Every other value is copied through unchanged.
    """
    converted = {}
    for name, raw in item.items():
        if isinstance(raw, Decimal):
            is_whole = raw % 1 == 0
            converted[name] = int(raw) if is_whole else float(raw)
            continue

        if not isinstance(raw, str):
            converted[name] = raw
            continue

        try:
            parsed = datetime.datetime.fromisoformat(raw.replace("Z", "+00:00"))
            # Trim microseconds down to milliseconds.
            converted[name] = parsed.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        except (ValueError, TypeError):
            # Not a date-time string: keep the original text.
            converted[name] = raw
    return converted
128
+
129
+
130
def _parse_item_date(value):
    """Parse a DynamoDB attribute value into a datetime, or return None.

    Numbers (Decimal, int, float) are read as Unix timestamps and returned as
    naive UTC datetimes. Strings are tried against a few fixed formats, then
    as ISO 8601 (a trailing ``Z`` is read as ``+00:00``), which may give a
    timezone-aware result. Anything else, or anything unparseable, gives None.
    """
    if value is None:
        return None

    if isinstance(value, (Decimal, int, float)):
        try:
            aware = datetime.datetime.fromtimestamp(
                float(value), tz=datetime.timezone.utc
            )
        except (ValueError, OverflowError, OSError):
            # Out-of-range, infinite or NaN timestamps are unparseable, not
            # fatal; OverflowError was previously left uncaught.
            return None
        # Drop tzinfo to keep the naive-UTC result callers compare against.
        return aware.replace(tzinfo=None)

    if isinstance(value, str):
        for fmt in (
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d",
            "%Y-%m-%dT%H:%M:%S",
        ):
            try:
                return datetime.datetime.strptime(value, fmt)
            except (ValueError, TypeError):
                continue
        try:
            return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            pass

    return None
160
+
161
+
162
def filter_items_by_date(items, from_date, to_date, date_attribute: str):
    """Return the items whose date attribute lies within the given bounds.

    Args:
        items: Iterable of dict-like items.
        from_date: Inclusive lower bound as a date, or None for no lower bound.
        to_date: Inclusive upper bound as a date, or None for no upper bound.
        date_attribute: Name of the attribute holding each item's date.

    Returns:
        ``items`` itself when both bounds are None; otherwise a new list of
        the matching items. Items whose date cannot be parsed are dropped.
    """
    if from_date is None and to_date is None:
        return items

    # A missing bound leaves that side open instead of raising TypeError.
    start = (
        None
        if from_date is None
        else datetime.datetime.combine(from_date, datetime.time.min)
    )
    end = (
        None
        if to_date is None
        else datetime.datetime.combine(to_date, datetime.time.max)
    )

    filtered = []
    for item in items:
        dt = _parse_item_date(item.get(date_attribute))
        if dt is None:
            continue
        if dt.tzinfo is not None:
            # Bounds are naive, so tzinfo is dropped without converting.
            dt = dt.replace(tzinfo=None)
        if start is not None and dt < start:
            continue
        if end is not None and dt > end:
            continue
        filtered.append(item)
    return filtered
179
+
180
+
181
def scan_table(table):
    """Scan the whole table, following pagination, and return every item.

    Keeps calling ``table.scan`` with the previous response's
    ``LastEvaluatedKey`` as ``ExclusiveStartKey`` until a response has none.
    """
    page = table.scan()
    collected = list(page["Items"])
    while "LastEvaluatedKey" in page:
        page = table.scan(ExclusiveStartKey=page["LastEvaluatedKey"])
        collected += page["Items"]
    return collected
190
+
191
+
192
def export_to_csv_buffer(items, fields_to_drop=None):
    """Serialise items to CSV text in memory.

    Columns are the sorted union of all item keys minus ``fields_to_drop``;
    values missing from an item are written as empty strings. Each row is
    passed through ``convert_types`` before being written.

    Returns:
        ``(csv_string, fieldnames)``, or ``("", [])`` when there are no items.
    """
    if not items:
        return "", []

    excluded = set(fields_to_drop or [])
    seen_keys = set()
    for item in items:
        seen_keys |= set(item.keys())
    fieldnames = sorted(seen_keys - excluded)

    out = StringIO()
    writer = csv.DictWriter(
        out, fieldnames=fieldnames, extrasaction="ignore", restval=""
    )
    writer.writeheader()
    writer.writerows(convert_types(item) for item in items)
    return out.getvalue(), fieldnames
214
+
215
+
216
def export_to_csv_file(items, output_path, fields_to_drop=None):
    """Write items as a CSV file at ``output_path``.

    Nothing is written when there is no CSV content. The parent directory is
    created if needed and the file is encoded as UTF-8 with a BOM.
    """
    csv_text, _ = export_to_csv_buffer(items, fields_to_drop)
    if csv_text:
        parent_dir = os.path.dirname(os.path.abspath(output_path)) or "."
        os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, "w", newline="", encoding="utf-8-sig") as handle:
            handle.write(csv_text)
224
+
225
+
226
def run_export(config):
    """Scan DynamoDB, filter by date, then write and upload the CSV.

    Args:
        config: Dict with the keys produced by ``get_config_from_env``.

    Returns:
        A summary dict with ``item_count``, ``from_date``, ``to_date`` and
        ``columns``, plus ``local_path`` or ``local_write_error``, and
        ``s3_uri`` or ``s3_skip_reason`` where applicable.

    Raises:
        ValueError: If ``from_date`` is after ``to_date``.
    """
    (
        table_name,
        region,
        local_output_path,
        from_date,
        to_date,
        date_attribute,
        s3_output_bucket,
        s3_output_key,
    ) = (
        config[key]
        for key in (
            "table_name",
            "region",
            "local_output_path",
            "from_date",
            "to_date",
            "date_attribute",
            "s3_output_bucket",
            "s3_output_key",
        )
    )

    if from_date > to_date:
        raise ValueError("FROM_DATE must be on or before TO_DATE")

    table = boto3.resource("dynamodb", region_name=region or None).Table(table_name)
    items = filter_items_by_date(scan_table(table), from_date, to_date, date_attribute)

    csv_string, fieldnames = export_to_csv_buffer(items, fields_to_drop=[])
    result = {
        "item_count": len(items),
        "from_date": str(from_date),
        "to_date": str(to_date),
        "columns": fieldnames,
    }

    # NOTE(review): the original indentation is not recoverable from this view;
    # the S3 upload is assumed to be skipped together with the local write when
    # there is no CSV content. Confirm against the real file.
    if not csv_string:
        return result

    # Best-effort local copy (e.g. /tmp in Lambda); a failure is recorded in
    # the result rather than aborting the upload.
    try:
        export_to_csv_file(items, local_output_path, fields_to_drop=[])
        result["local_path"] = local_output_path
    except Exception as e:
        result["local_write_error"] = str(e)

    # Upload only when both bucket and key are set.
    if s3_output_bucket and s3_output_key:
        s3 = boto3.client("s3", region_name=region or None)
        s3.put_object(
            Bucket=s3_output_bucket,
            Key=s3_output_key,
            Body=csv_string.encode("utf-8-sig"),
            ContentType="text/csv; charset=utf-8",
        )
        result["s3_uri"] = f"s3://{s3_output_bucket}/{s3_output_key}"
    elif s3_output_bucket or s3_output_key:
        result["s3_skip_reason"] = (
            "Both S3_OUTPUT_BUCKET and S3_OUTPUT_KEY must be set"
        )

    return result
280
+
281
+
282
def lambda_handler(event, context):
    """AWS Lambda entrypoint: run the export and return its summary.

    Configuration comes from environment variables. When ``event`` is a dict,
    any truthy ``table_name``, ``region``, ``from_date``, ``to_date``,
    ``date_attribute``, ``s3_output_bucket`` or ``s3_output_key`` it carries
    overrides the matching setting. Date overrides must be ``YYYY-MM-DD``.

    Returns:
        ``{"statusCode": 200, "body": <run_export result>}``.
    """
    config = get_config_from_env(context=context)

    # Optional: allow event to override env-based config
    if isinstance(event, dict):
        for key in ("table_name", "region"):
            if event.get(key):
                config[key] = event[key]

        for key in ("from_date", "to_date"):
            if event.get(key):
                config[key] = datetime.datetime.strptime(event[key], "%Y-%m-%d").date()

        for key in ("date_attribute", "s3_output_bucket", "s3_output_key"):
            if event.get(key):
                config[key] = event[key]

    return {"statusCode": 200, "body": run_export(config)}
314
+
315
+
316
if __name__ == "__main__":
    # Local run: configuration comes from environment variables; no Lambda
    # event or context is supplied.
    import json

    print(json.dumps(lambda_handler({}, None), indent=2))
cdk/post_cdk_build_quickstart.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from cdk_config import (
4
+ CLUSTER_NAME,
5
+ CODEBUILD_PROJECT_NAME,
6
+ ECS_SERVICE_NAME,
7
+ S3_LOG_CONFIG_BUCKET_NAME,
8
+ )
9
+ from cdk_functions import (
10
+ create_basic_config_env,
11
+ start_codebuild_build,
12
+ start_ecs_task,
13
+ upload_file_to_s3,
14
+ )
15
+ from tqdm import tqdm
16
+
17
# Write a basic config.env that the user can use to run the app later.
# The argument is the folder the file is saved into.
create_basic_config_env("config")

# Kick off the CodeBuild project.
print("Starting CodeBuild project.")
start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)

# Upload the config.env file to the S3 bucket.
upload_file_to_s3(
    local_file_paths="config/config.env",
    s3_key="",
    s3_bucket=S3_LOG_CONFIG_BUCKET_NAME,
)

total_seconds = 660  # 11 minutes
update_interval = 1  # Seconds slept per progress-bar step

print("Waiting 11 minutes for the CodeBuild container to build.")

# The progress bar advances one step per loop pass; each pass sleeps for
# update_interval seconds, so the loop as a whole lasts total_seconds.
for _ in tqdm(range(total_seconds), desc="Building container"):
    time.sleep(update_interval)

# Start the task on ECS once the wait has elapsed.
print("Starting ECS task")
start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
cdk/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ aws-cdk-lib==2.243.0
2
+ boto3==1.42.61
3
+ pandas==2.3.3
4
+ nodejs==0.1.1
5
+ python-dotenv==1.0.1
cli_redact.py ADDED
The diff for this file is too large to render. See raw diff
 
docker-compose_llama.yml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pick which GGUF model runs by setting COMPOSE_PROFILES in .env (or pass --profile):
2
+ # COMPOSE_PROFILES=35b -> qwen35-35b_q4_gguf
3
+ # COMPOSE_PROFILES=27b -> qwen35-27b_q4_gguf
4
+ # The app always talks to http://llama-inference:8080 (shared network alias on both model services).
5
+ # Each model service uses its own llama.cpp cache volume so mmproj-F16.gguf (same filename per repo)
6
+ # is never shared between 35B and 27B downloads.
7
+ # Example CLI commands:
8
+ # docker compose -f docker-compose_llama.yml --profile 35b up -d
9
+ # docker compose -f docker-compose_llama.yml --profile 27b up -d
10
+ services:
11
+ # Qwen 3.5 35B model setup below requires 24GB of VRAM with n-cpu-moe set to 0. For lower VRAM systems, n-cpu-moe ~ 40 could work for a 12GB VRAM system, and n-cpu-moe ~ 20 for a 16GB VRAM system.
12
+ qwen35-35b_q4_gguf:
13
+ profiles: ["35b"]
14
+ image: ghcr.io/ggml-org/llama.cpp:server-cuda12
15
+ command:
16
+ - -hf
17
+ - unsloth/Qwen3.5-35B-A3B-GGUF
18
+ - --hf-file
19
+ - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
20
+ - --mmproj-url
21
+ - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
22
+ - --n-gpu-layers
23
+ - "999"
24
+ - --ctx-size
25
+ - "32768"
26
+ - --fit
27
+ - "off"
28
+ - --temp
29
+ - "0.7"
30
+ - --top-k
31
+ - "20"
32
+ - --top-p
33
+ - "0.8"
34
+ - --min-p
35
+ - "0.0"
36
+ - --frequency-penalty
37
+ - "1"
38
+ - --presence-penalty
39
+ - "1"
40
+ - --host
41
+ - "0.0.0.0"
42
+ - --port
43
+ - "8080"
44
+ - --no-warmup
45
+ - --seed
46
+ - "42"
47
+ - --n-cpu-moe
48
+ - "0" # Increase this value to fit within your available VRAM
49
+ ports:
50
+ - "8001:8080"
51
+ volumes:
52
+ - ./models:/models
53
+ - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
54
+ deploy:
55
+ resources:
56
+ reservations:
57
+ devices:
58
+ - driver: nvidia
59
+ count: all
60
+ capabilities: [gpu]
61
+ healthcheck:
62
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
63
+ interval: 30s
64
+ timeout: 15s
65
+ retries: 8
66
+ start_period: 1200s
67
+ networks:
68
+ redaction-net-llama:
69
+ aliases:
70
+ - llama-inference
71
+
72
+ # Qwen 3.5 27B model setup below requires 24GB of VRAM to run.
73
+ qwen35-27b_q4_gguf:
74
+ profiles: ["27b"]
75
+ image: ghcr.io/ggml-org/llama.cpp:server-cuda12
76
+ command:
77
+ - -hf
78
+ - unsloth/Qwen3.5-27B-GGUF
79
+ - --hf-file
80
+ - Qwen3.5-27B-UD-Q4_K_XL.gguf
81
+ - --mmproj-url
82
+ - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
83
+ - --n-gpu-layers
84
+ - "999"
85
+ - --ctx-size
86
+ - "32768"
87
+ - --fit
88
+ - "off"
89
+ - --temp
90
+ - "0.7"
91
+ - --top-k
92
+ - "20"
93
+ - --top-p
94
+ - "0.8"
95
+ - --min-p
96
+ - "0.0"
97
+ - --frequency-penalty
98
+ - "1"
99
+ - --presence-penalty
100
+ - "1"
101
+ - --host
102
+ - "0.0.0.0"
103
+ - --port
104
+ - "8080"
105
+ - --no-warmup
106
+ - --seed
107
+ - "42"
108
+ ports:
109
+ - "8000:8080"
110
+ volumes:
111
+ - ./models:/models
112
+ - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
113
+ deploy:
114
+ resources:
115
+ reservations:
116
+ devices:
117
+ - driver: nvidia
118
+ count: all
119
+ capabilities: [gpu]
120
+ healthcheck:
121
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
122
+ interval: 30s
123
+ timeout: 15s
124
+ retries: 8
125
+ start_period: 1200s
126
+ networks:
127
+ redaction-net-llama:
128
+ aliases:
129
+ - llama-inference
130
+
131
+ redaction-app-llama:
132
+ profiles: ["35b", "27b"]
133
+ image: redaction-app-main
134
+ build:
135
+ context: . # Look in the current folder
136
+ dockerfile: Dockerfile # Use this file
137
+ target: gradio # Use the 'gradio' stage from your Dockerfile
138
+ args: # Pass your build-time variables here!
139
+ - TORCH_GPU_ENABLED=False
140
+ - INSTALL_VLM=False
141
+ - PADDLE_GPU_ENABLED=True
142
+ - INSTALL_PADDLEOCR=True
143
+ shm_size: '8gb'
144
+ depends_on:
145
+ qwen35-35b_q4_gguf:
146
+ condition: service_healthy
147
+ required: false
148
+ qwen35-27b_q4_gguf:
149
+ condition: service_healthy
150
+ required: false
151
+ environment:
152
+ - FLAGS_fraction_of_gpu_memory_to_use=0.05
153
+ - RUN_FASTAPI=True
154
+ - APP_MODE=fastapi
155
+ - SHOW_PADDLE_MODEL_OPTIONS=True
156
+ - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
157
+ - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
158
+ - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
159
+ - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
160
+ - SHOW_HYBRID_MODELS=True
161
+ - SHOW_DIFFICULT_OCR_EXAMPLES=True
162
+ - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
163
+ - SHOW_SUMMARISATION=True
164
+ - SHOW_AWS_API_KEYS=True
165
+ - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
166
+ - DEFAULT_LOCAL_OCR_MODEL=paddle
167
+ - DEFAULT_PII_DETECTION_MODEL=Local
168
+ - INFERENCE_SERVER_API_URL=http://llama-inference:8080
169
+ - DEFAULT_INFERENCE_SERVER_VLM_MODEL=""
170
+ - DEFAULT_INFERENCE_SERVER_PII_MODEL=""
171
+ - CUSTOM_VLM_BACKEND=inference_vlm
172
+ - MAX_WORKERS=12
173
+ - TESSERACT_MAX_WORKERS=8
174
+ - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
175
+ - LOAD_PADDLE_AT_STARTUP=False
176
+ - EFFICIENT_OCR=True
177
+ - SHOW_CUSTOM_VLM_ENTITIES=True
178
+ - SESSION_OUTPUT_FOLDER=True
179
+ - SAVE_PAGE_OCR_VISUALISATIONS=False
180
+ - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
181
+ - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
182
+ - PREPROCESS_LOCAL_OCR_IMAGES=False
183
+ - INFERENCE_SERVER_DISABLE_THINKING=True
184
+ - MAX_NEW_TOKENS=16384
185
+ - SAVE_EXAMPLE_HYBRID_IMAGES=False
186
+ - SAVE_VLM_INPUT_IMAGES=False
187
+ - VLM_MAX_DPI=200.0
188
+ - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
189
+ - REPORT_VLM_OUTPUTS_TO_GUI=True
190
+ - REPORT_LLM_OUTPUTS_TO_GUI=True
191
+ - ADD_VLM_BOUNDING_BOX_RULES=False
192
+
193
+ deploy:
194
+ resources:
195
+ reservations:
196
+ devices:
197
+ - driver: nvidia
198
+ count: all
199
+ capabilities: [gpu]
200
+ ports:
201
+ - "7861:7860"
202
+ networks:
203
+ - redaction-net-llama
204
+
205
+ networks:
206
+ redaction-net-llama:
207
+ driver: bridge
208
+
209
+ volumes:
210
+ hf-llama-cache-qwen35-35b:
211
+ hf-llama-cache-qwen35-27b:
docker-compose_vllm.yml ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
2
+ # to match the served model (required for 27b; 9b defaults below if omitted):
3
+ # COMPOSE_PROFILES=vllm-9b -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
4
+ # COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
5
+ # App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
6
+ # Example CLI commands:
7
+ # docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
8
+ # docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
9
+ services:
10
+ vllm-server-qwen35-9b:
11
+ profiles: ["vllm-9b"]
12
+ image: vllm/vllm-openai:latest
13
+ shm_size: '8gb'
14
+ command: |
15
+ --model QuantTrio/Qwen3.5-9B-AWQ
16
+ --gpu-memory-utilization 0.7
17
+ --tensor-parallel-size 1
18
+ --max-num-seqs 1
19
+ --reasoning-parser qwen3
20
+ --max-model-len 32768
21
+ --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
22
+ --max-num-batched-tokens 2048
23
+
24
+ deploy:
25
+ resources:
26
+ reservations:
27
+ devices:
28
+ - driver: nvidia
29
+ count: all
30
+ capabilities: [gpu]
31
+ healthcheck:
32
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
33
+ interval: 30s
34
+ timeout: 15s
35
+ retries: 8
36
+ start_period: 1200s
37
+ ports:
38
+ - "8000:8000"
39
+ volumes:
40
+ - hf-model-cache:/root/.cache/huggingface
41
+ networks:
42
+ redaction-net-vllm:
43
+ aliases:
44
+ - vllm-inference
45
+
46
+ vllm-server-qwen35-27b:
47
+ profiles: ["vllm-27b"]
48
+ image: vllm/vllm-openai:latest
49
+ shm_size: '16gb'
50
+ command: |
51
+ --model QuantTrio/Qwen3.5-27B-AWQ
52
+ --gpu-memory-utilization 0.94
53
+ --tensor-parallel-size 1
54
+ --max-num-seqs 2
55
+ --reasoning-parser qwen3
56
+ --max-model-len 16384
57
+ --max-num-batched-tokens 4096
58
+ --enforce-eager
59
+ --kv-cache-dtype fp8
60
+ --enable-chunked-prefill
61
+ --enable-prefix-caching
62
+
63
+ deploy:
64
+ resources:
65
+ reservations:
66
+ devices:
67
+ - driver: nvidia
68
+ count: all
69
+ capabilities: [gpu]
70
+ healthcheck:
71
+ test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
72
+ interval: 30s
73
+ timeout: 15s
74
+ retries: 8
75
+ start_period: 1200s
76
+ ports:
77
+ - "8001:8000"
78
+ volumes:
79
+ - hf-model-cache:/root/.cache/huggingface
80
+ networks:
81
+ redaction-net-vllm:
82
+ aliases:
83
+ - vllm-inference
84
+
85
+ redaction-app-vllm:
86
+ profiles: ["vllm-9b", "vllm-27b"]
87
+ image: redaction-app-main
88
+ build:
89
+ context: . # Look in the current folder
90
+ dockerfile: Dockerfile # Use this file
91
+ target: gradio # Use the 'gradio' stage from your Dockerfile
92
+ args: # Pass your build-time variables here!
93
+ - TORCH_GPU_ENABLED=False
94
+ - INSTALL_VLM=False
95
+ - PADDLE_GPU_ENABLED=True
96
+ - INSTALL_PADDLEOCR=True
97
+ shm_size: '8gb'
98
+ depends_on:
99
+ vllm-server-qwen35-9b:
100
+ condition: service_healthy
101
+ required: false
102
+ vllm-server-qwen35-27b:
103
+ condition: service_healthy
104
+ required: false
105
+ environment:
106
+ - FLAGS_fraction_of_gpu_memory_to_use=0.05
107
+ - RUN_FASTAPI=True
108
+ - APP_MODE=fastapi
109
+ - SHOW_PADDLE_MODEL_OPTIONS=True
110
+ - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
111
+ - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
112
+ - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
113
+ - SHOW_HYBRID_MODELS=True
114
+ - SHOW_DIFFICULT_OCR_EXAMPLES=True
115
+ - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
116
+ - SHOW_SUMMARISATION=True
117
+ - SHOW_AWS_API_KEYS=True
118
+ - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
119
+ - DEFAULT_LOCAL_OCR_MODEL=paddle
120
+ - DEFAULT_PII_DETECTION_MODEL=Local
121
+ - CUSTOM_VLM_BACKEND=inference_vlm
122
+ - MAX_WORKERS=12
123
+ - TESSERACT_MAX_WORKERS=8
124
+ - PADDLE_MAX_WORKERS=1 # Keep this to 1 to avoid VRAM overflow or errors
125
+ - LOAD_PADDLE_AT_STARTUP=False
126
+ - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
127
+ - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
128
+ - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ} # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
129
+ - EFFICIENT_OCR=True
130
+ - SHOW_CUSTOM_VLM_ENTITIES=True
131
+ - SESSION_OUTPUT_FOLDER=True
132
+ - SAVE_PAGE_OCR_VISUALISATIONS=False
133
+ - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
134
+ - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
135
+ - PREPROCESS_LOCAL_OCR_IMAGES=False
136
+ - INFERENCE_SERVER_DISABLE_THINKING=True
137
+ - MAX_NEW_TOKENS=16384
138
+ - SAVE_EXAMPLE_HYBRID_IMAGES=False
139
+ - SAVE_VLM_INPUT_IMAGES=False
140
+ - VLM_MAX_DPI=200.0
141
+ - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
142
+ - REPORT_VLM_OUTPUTS_TO_GUI=True
143
+ - REPORT_LLM_OUTPUTS_TO_GUI=True
144
+ - ADD_VLM_BOUNDING_BOX_RULES=False
145
+
146
+ deploy:
147
+ resources:
148
+ reservations:
149
+ devices:
150
+ - driver: nvidia
151
+ count: all
152
+ capabilities: [gpu]
153
+ ports:
154
+ - "7860:7860"
155
+ networks:
156
+ - redaction-net-vllm
157
+
158
+ networks:
159
+ redaction-net-vllm:
160
+ driver: bridge
161
+
162
+ volumes:
163
+ hf-model-cache:
entrypoint.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh
#
# Container entrypoint. Picks how to launch the app from environment variables:
#   APP_MODE=lambda   -> run the AWS Lambda runtime interface client
#   RUN_FASTAPI=True  -> serve app:app with uvicorn
#   anything else     -> run app.py directly (Gradio)

# Exit immediately if a command exits with a non-zero status.
set -e

echo "Starting in APP_MODE: $APP_MODE"

# --- Ensure application directories are writable by the current user ---
# This is important when Docker volumes are bind-mounted from the host and
# the host directory may be owned by root (uid 0), which would prevent the
# non-root container user (uid 1000) from writing output/input files.
# Directory creation is best-effort; an unwritable directory only produces a
# warning on stderr so that startup can continue.
for dir in \
    "${GRADIO_OUTPUT_FOLDER:-/home/user/app/output}" \
    "${GRADIO_INPUT_FOLDER:-/home/user/app/input}" \
    "${GRADIO_TEMP_DIR:-/tmp/gradio_tmp}" \
    "${ACCESS_LOGS_FOLDER:-/home/user/app/logs}" \
    "${USAGE_LOGS_FOLDER:-/home/user/app/usage}" \
    "${FEEDBACK_LOGS_FOLDER:-/home/user/app/feedback}" \
    "${CONFIG_FOLDER:-/home/user/app/config}"; do
    mkdir -p "$dir" 2>/dev/null || true
    if [ ! -w "$dir" ]; then
        echo "WARNING: Directory $dir is not writable by current user (uid=$(id -u)). File I/O will fail." >&2
    fi
done

# --- Start the app based on mode ---

if [ "$APP_MODE" = "lambda" ]; then
    echo "Starting in Lambda mode..."
    # The CMD from Dockerfile will be passed as "$@"
    exec python -m awslambdaric "$@"
else
    echo "Starting in Gradio/FastAPI mode..."

    if [ "$RUN_FASTAPI" = "True" ]; then
        echo "Starting in FastAPI mode..."

        # Fall back to listening on all interfaces, port 7860, when unset or empty.
        GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME:-0.0.0.0}
        GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}

        # Start uvicorn server.
        # Host and port are quoted so that the values are passed as single
        # arguments and are not subject to field splitting or globbing.
        echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
        exec uvicorn app:app \
            --host "$GRADIO_SERVER_NAME" \
            --port "$GRADIO_SERVER_PORT" \
            --proxy-headers \
            --forwarded-allow-ips "*"
    else
        echo "Starting in Gradio mode..."
        exec python app.py
    fi
fi
example_app_config.env ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rename this file to app_config.env and place it in the folder config/ (i.e. it will be located at app_base_folder/config/app_config.env). The app will then automatically load these variables at startup. See tools/config.py for all the possible config variables you can set, or src/app_settings.qmd for descriptions. Below are some suggested config variables to start with.
2
+
3
+ # General app run options
4
+ TESSERACT_FOLDER=tesseract/ # If in a custom folder, not needed if in PATH
5
+ POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # If in a custom folder, not needed if in PATH
6
+
7
+ GRADIO_SERVER_NAME=127.0.0.1
8
+ GRADIO_SERVER_PORT=7860
9
+
10
+ USER_GUIDE_URL=<ENTER_URL>
11
+ CUSTOM_BOX_COLOUR=(128, 128, 128)
12
+ RUN_FASTAPI=False
13
+ FAVICON_PATH=favicon.png
14
+ INTRO_TEXT=intros/short_intro.txt
15
+
16
+ # GUI options
17
+ SHOW_QUICKSTART=False
18
+ SHOW_SUMMARISATION=True
19
+ SHOW_EXAMPLES=True
20
+ SHOW_DIFFICULT_OCR_EXAMPLES=True
21
+ SHOW_LANGUAGE_SELECTION=True
22
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=False
23
+ SHOW_COSTS=True
24
+ SHOW_LOCAL_OCR_MODEL_OPTIONS=True
25
+ SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
26
+ SHOW_PII_IDENTIFICATION_OPTIONS=True
27
+ SHOW_LOCAL_PII_DETECTION_OPTIONS=True
28
+ SHOW_OCR_GUI_OPTIONS=True
29
+ EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT=True
30
+
31
+ # Model / redaction process options
32
+ DEFAULT_LOCAL_OCR_MODEL=tesseract
33
+ OVERWRITE_EXISTING_OCR_RESULTS=False
34
+ PREPROCESS_LOCAL_OCR_IMAGES=False # Whether to apply corrections to input images before processing. Will slow down redaction processes
35
+ MAX_WORKERS=4 # How many workers should be working in parallel to run various text extraction/redaction tasks. Adjust depending on how many CPUs your computer has
36
+
37
+ EFFICIENT_OCR=True
38
+ OVERWRITE_EXISTING_OCR_RESULTS=True
39
+ INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
40
+
41
+ # Redaction box appearance
42
+ CUSTOM_BOX_COLOUR=(128, 128, 128)
43
+ USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False
44
+
45
+ # Image save options
46
+ SAVE_PAGE_OCR_VISUALISATIONS=True
47
+ SAVE_PREPROCESS_IMAGES=True
48
+
49
+ # Saving and logging variables
50
+ SAVE_LOGS_TO_CSV=True
51
+ SESSION_OUTPUT_FOLDER=True # Save outputs into user session folders
52
+ DISPLAY_FILE_NAMES_IN_LOGS=False
53
+
54
+ # PaddleOCR
55
+ SHOW_PADDLE_MODEL_OPTIONS=False
56
+ LOAD_PADDLE_AT_STARTUP=False
57
+ PADDLE_MAX_WORKERS=4 # Number of simultaneous workers for Paddle OCR tasks. Generally advised to keep at 1, but may work with 2 or more depending on your system.
58
+
59
+ # GUI show VLM/LLM models
60
+ SHOW_HYBRID_MODELS=False
61
+ SHOW_CUSTOM_VLM_ENTITIES=False
62
+ SHOW_VLM_MODEL_OPTIONS=True
63
+ SHOW_INFERENCE_SERVER_PII_OPTIONS=False
64
+ SHOW_INFERENCE_SERVER_VLM_OPTIONS=False
65
+ SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS=False
66
+
67
+ # VLM using Transformers options
68
+ SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL=Qwen3.5-9B
69
+ QUANTISE_VLM_MODELS=False
70
+ USE_TRANSFORMERS_VLM_MODEL_AS_LLM=True
71
+ LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE=None
72
+ QUANTISE_TRANSFORMERS_LLM_MODELS=False
73
+ LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START=False
74
+ LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True
75
+
76
+ # VLM using inference server options (vLLM / Llama.cpp server)
77
+ INFERENCE_SERVER_API_URL=http://192.168.0.220:8080
78
+ USE_LLAMA_SWAP=True
79
+ INFERENCE_SERVER_LLM_PII_MODEL_CHOICE=qwen_3_5_27b
80
+
81
+ # General VLM / LLM options
82
+ VLM_DISABLE_QWEN3_5_THINKING=True
83
+ LLM_MAX_NEW_TOKENS=8192
84
+ CUSTOM_VLM_BACKEND=bedrock_vlm # Which model type to use to do face / signature detection. Can choose from "transformers_vlm", "inference_vlm", "bedrock_vlm"
85
+
86
+ # AWS related variables
87
+ RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them
88
+ AWS_REGION=example-region
89
+ DOCUMENT_REDACTION_BUCKET=example-bucket
90
+
91
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
92
+ SHOW_AWS_PII_DETECTION_OPTIONS=True
93
+
94
+ SHOW_AWS_EXAMPLES=True
95
+ RUN_ALL_EXAMPLES_THROUGH_AWS=True
96
+
97
+ SAVE_LOGS_TO_DYNAMODB=True
98
+ ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log
99
+ USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage
100
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback
101
+
102
+ # AWS Textract options
103
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True
104
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True
105
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output
106
+ INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION=False
107
+ INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION=False
108
+ INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION=False
109
+ INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION=False # Needs a VLM option available to work
110
+
111
+ # AWS VLM / LLM options
112
+ SHOW_BEDROCK_VLM_MODELS=False
113
+ SHOW_AWS_BEDROCK_LLM_MODELS=False
114
+ HYBRID_TEXTRACT_BEDROCK_VLM=False
115
+
116
+ CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0
117
+ CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE=anthropic.claude-sonnet-4-6 #amazon.nova-pro-v1:0
118
+ CLOUD_VLM_MODEL_CHOICE=amazon.nova-pro-v1:0 # other possibles: anthropic.claude-sonnet-4-6 #qwen.qwen3-vl-235b-a22b # anthropic.claude-sonnet-4-6 #
119
+ CLOUD_SUMMARISATION_MODEL_CHOICE=amazon.nova-lite-v1:0
120
+
121
+ # Cost code related variables
122
+ SHOW_COSTS=True
123
+ GET_COST_CODES=True
124
+ COST_CODES_PATH=config/cost_codes.csv
125
+ ENFORCE_COST_CODES=True
126
+ DEFAULT_COST_CODE=example_cost_code
127
+
128
+ # S3 cost codes
129
+ S3_COST_CODES_PATH=cost_codes.csv
example_data/Bold minimalist professional cover letter.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
3
+ size 23992
example_data/Difficult handwritten note.jpg ADDED

Git LFS Details

  • SHA256: 28896bfa4c4d6ef48222a285c02529dc8967d15d799df5c4b4cf0f62224e7b6c
  • Pointer size: 130 Bytes
  • Size of remote file: 85.1 kB
example_data/Example-cv-university-graduaty-hr-role-with-photo-2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caf00ca5cb06b8019804d1a7eaeceec772607969e8cad6c34d1d583876345b90
3
+ size 116763
example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/Partnership-Agreement-Toolkit_0_0.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
3
+ size 426602
example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ another country or territory sign a formel agreement on behalf? of their communities endorsing a
2
+ soster citues international
example_data/combined_case_notes.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,Social Worker,Client,Case Note
2
+ "January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
3
+ "January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
4
+ "February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
5
+ "February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
6
+ "March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
7
+ "March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
8
+ "April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
9
+ "April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
10
+ "May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
11
+ "January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
12
+ "January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
13
+ "February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
14
+ "February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
15
+ "March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
16
+ "March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
17
+ "April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
18
+ "April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
19
+ "May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
example_data/combined_case_notes.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09300597024591d0b5b4ef97faef12fcceb28fcbb6ea09260bc42f43967753a4
3
+ size 12579
example_data/doubled_output_joined.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
3
+ size 1274719
example_data/example_complaint_letter.jpg ADDED

Git LFS Details

  • SHA256: db33b67ebe685132a589593e4a3ca05f2dbce358b63de9142c2f2a36202e3f15
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
example_data/example_of_emails_sent_to_a_professor_before_applying.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
3
+ size 8848
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
3
+ 1,SisterCities,0.169804,0.033333,0.238431,0.028182,2
4
+ 1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
5
+ 1,Toolkit,0.830588,0.07303,0.126667,0.025152,4
6
+ 1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,5
7
+ 1,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
8
+ 1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
9
+ 1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
10
+ 1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
11
+ 1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
12
+ 1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
13
+ 1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
14
+ 1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
15
+ 1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
16
+ 1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
17
+ 1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
18
+ 1,Friendship City,0.118039,0.372121,0.127059,0.013939,17
19
+ 1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
20
+ 1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
21
+ 1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
22
+ 1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
23
+ 1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
24
+ 1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
25
+ 1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
26
+ 1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
27
+ 1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
28
+ 1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
29
+ 1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
30
+ 1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
31
+ 1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
32
+ 1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
33
+ 1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
34
+ 1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
35
+ 1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
36
+ 1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
37
+ 1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
38
+ 1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
39
+ 1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
40
+ 1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
41
+ 1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
42
+ 1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
43
+ 1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
44
+ 1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
45
+ 1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
46
+ 1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
47
+ 2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,1
48
+ 2,SisterCities,0.169804,0.033333,0.238824,0.028182,2
49
+ 2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
50
+ 2,Toolkit,0.83098,0.072727,0.127059,0.025455,4
51
+ 2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,5
52
+ 2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
53
+ 2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
54
+ 2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
55
+ 2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
56
+ 2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
57
+ 2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
58
+ 2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
59
+ 2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
60
+ 2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
61
+ 2,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
62
+ 2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
63
+ 2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
64
+ 2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
65
+ 2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
66
+ 2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
67
+ 2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
68
+ 2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
69
+ 2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
70
+ 2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
71
+ 2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
72
+ 2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
73
+ 2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
74
+ 2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
75
+ 2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
76
+ 2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
77
+ 2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
78
+ 2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
79
+ 2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
80
+ 2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
81
+ 2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
82
+ 2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
83
+ 2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
84
+ 2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
85
+ 2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
86
+ 2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
87
+ 2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
88
+ 2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
89
+ 2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
90
+ 2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
91
+ 2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,1
92
+ 2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,2
93
+ 3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,3
94
+ 3,SisterCities,0.169804,0.033333,0.239216,0.028182,4
95
+ 3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,5
96
+ 3,Toolkit,0.83098,0.07303,0.126667,0.025152,6
97
+ 3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,7
98
+ 3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,8
99
+ 3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,9
100
+ 3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,10
101
+ 3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,11
102
+ 3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,12
103
+ 3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,13
104
+ 3,and cooperation.,0.176471,0.25697,0.13451,0.013333,14
105
+ 3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,15
106
+ 3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,16
107
+ 3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,17
108
+ 3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,18
109
+ 3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,19
110
+ 3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,20
111
+ 3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,21
112
+ 3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,22
113
+ 3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,23
114
+ 3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,24
115
+ 3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,25
116
+ 3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,26
117
+ 3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,27
118
+ 3,for their records.,0.176078,0.550606,0.131373,0.010606,28
119
+ 3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,29
120
+ 3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,30
121
+ 3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,31
122
+ 3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,32
123
+ 3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,33
124
+ 3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,34
125
+ 3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,35
126
+ 3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,36
127
+ 3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,37
128
+ 3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,38
129
+ 3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,39
130
+ 3,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,40
131
+ 3,347-8630.,0.117647,0.799394,0.080392,0.010303,41
132
+ 4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,1
133
+ 4,SisterCities,0.169412,0.033333,0.239608,0.028485,2
134
+ 4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
135
+ 4,Toolkit,0.830588,0.072727,0.127843,0.025758,4
136
+ 4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
137
+ 4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
138
+ 4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
139
+ 4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
140
+ 4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
141
+ 4,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
142
+ 4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
143
+ 4,AND,0.487843,0.452727,0.048235,0.011212,12
144
+ 4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
145
+ 4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
146
+ 4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
147
+ 4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
148
+ 4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
149
+ 4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
150
+ 4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
151
+ 4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
152
+ 4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
153
+ 4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
154
+ 4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
155
+ 4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
156
+ 4,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
157
+ 4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
158
+ 4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
159
+ 4,A,0.344314,0.768485,0.084706,0.030303,28
160
+ 4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
161
+ 4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
162
+ 4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,31
163
+ 4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,32
164
+ 4,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
165
+ 5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
166
+ 5,SisterCities,0.169412,0.033333,0.239608,0.028485,2
167
+ 5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
168
+ 5,Toolkit,0.83098,0.072727,0.127059,0.025758,4
169
+ 5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
170
+ 5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
171
+ 5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
172
+ 5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
173
+ 5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
174
+ 5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
175
+ 5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
176
+ 5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
177
+ 5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
178
+ 5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
179
+ 5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
180
+ 5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
181
+ 5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
182
+ 5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
183
+ 5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
184
+ 5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
185
+ 5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
186
+ 5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
187
+ 5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
188
+ 5,the cities;,0.22902,0.624545,0.076471,0.012424,24
189
+ 5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
190
+ 5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
191
+ 5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
192
+ 5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
193
+ 5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
194
+ 5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
195
+ 5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
196
+ 5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,32
197
+ 5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,33
198
+ 5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,34
199
+ 5,Mayor,0.311373,0.894848,0.053333,0.012727,35
200
+ 5,New York City,0.287843,0.909091,0.121176,0.013333,36
201
+ 5,London,0.701961,0.909091,0.061569,0.010606,37
202
+ 6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,1
203
+ 6,SisterCities,0.169412,0.03303,0.24,0.028182,2
204
+ 6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
205
+ 6,Toolkit,0.83098,0.072727,0.127451,0.025758,4
206
+ 6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,5
207
+ 6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
208
+ 6,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
209
+ 6,California,0.551373,0.257273,0.136471,0.033333,8
210
+ 6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
211
+ 6,between the,0.464706,0.352727,0.084314,0.009697,10
212
+ 6,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
213
+ 6,"California, USA",0.4,0.397576,0.21098,0.016061,12
214
+ 6,and the,0.48,0.415152,0.053333,0.009091,13
215
+ 6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
216
+ 6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
217
+ 6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
218
+ 6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
219
+ 6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
220
+ 6,purposes:,0.216863,0.516061,0.058039,0.009394,19
221
+ 6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
222
+ 6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
223
+ 6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
224
+ 6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
225
+ 6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
226
+ 6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
227
+ 6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
228
+ 6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
229
+ 6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
230
+ 6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
231
+ 6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
232
+ 6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
233
+ 6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
234
+ 6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
235
+ 6,"California, USA",0.582745,0.765758,0.125098,0.01303,34
236
+ 6,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
237
+ 6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
238
+ 6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
239
+ 6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
240
+ 6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
241
+ 7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,1
242
+ 7,SisterCities,0.169412,0.03303,0.24,0.028485,2
243
+ 7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
244
+ 7,Toolkit,0.83098,0.072727,0.127451,0.025758,4
245
+ 7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,5
246
+ 7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
247
+ 7,adopted by,0.2,0.213333,0.080392,0.013636,7
248
+ 7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
249
+ 7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
250
+ 7,and,0.199608,0.260909,0.026275,0.010606,10
251
+ 7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
252
+ 7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
253
+ 7,ON,0.551765,0.298182,0.026667,0.011515,13
254
+ 7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
255
+ 7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
256
+ 7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
257
+ 7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
258
+ 7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
259
+ 7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
260
+ 7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
261
+ 7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
262
+ 7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
263
+ 7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
264
+ 7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
265
+ 7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
266
+ 7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
267
+ 7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
268
+ 7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
269
+ 7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
270
+ 7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
271
+ 7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
272
+ 7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
273
+ 7,3h.5.,0.593725,0.750606,0.218039,0.06303,33
274
+ 7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,34
275
+ 7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,35
276
+ 7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,36
277
+ 7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,37
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
3
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
4
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
5
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
6
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
7
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,akaplan@sister-cities.org
8
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
9
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
10
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
11
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
12
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
13
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
14
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
15
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
16
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
17
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
18
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
19
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
20
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
21
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
22
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
23
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
24
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
25
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
26
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
27
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
28
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
29
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
30
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
31
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
32
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
33
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
34
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
35
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
36
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
37
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
38
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
39
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
40
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
41
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
42
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
43
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
44
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
45
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
46
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
47
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
48
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
49
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
50
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
51
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
52
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
53
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
54
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
55
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
56
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
57
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
58
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
59
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
60
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
61
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
62
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
63
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
64
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
65
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
66
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
67
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
68
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
69
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
70
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
71
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
72
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
73
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
74
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
75
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
76
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
77
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv ADDED
@@ -0,0 +1,923 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
3
+ 1,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
4
+ 1,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
5
+ 1,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
6
+ 1,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
7
+ 1,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
8
+ 1,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
9
+ 1,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
10
+ 1,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
11
+ 1,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
12
+ 1,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
13
+ 1,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
14
+ 1,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
15
+ 1,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
16
+ 1,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
17
+ 1,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
18
+ 1,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
19
+ 1,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
20
+ 1,Brooke,0.118431,0.51,0.050588,0.01,19
21
+ 1,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
22
+ 1,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
23
+ 1,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
24
+ 2,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
25
+ 2,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
26
+ 2,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
27
+ 2,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
28
+ 2,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
29
+ 2,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
30
+ 2,London,0.083837,0.125428,0.066102,0.011117,7
31
+ 2,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
32
+ 2,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
33
+ 2,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
34
+ 2,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
35
+ 2,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
36
+ 2,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
37
+ 2,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
38
+ 2,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
39
+ 2,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
40
+ 2,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
41
+ 2,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
42
+ 2,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
43
+ 2,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
44
+ 2,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
45
+ 2,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
46
+ 2,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
47
+ 2,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
48
+ 2,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
49
+ 2,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
50
+ 2,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
51
+ 2,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
52
+ 2,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
53
+ 2,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
54
+ 2,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
55
+ 2,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
56
+ 2,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
57
+ 2,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
58
+ 2,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
59
+ 2,peers.,0.083837,0.652509,0.05401,0.011117,36
60
+ 2,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
61
+ 2,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
62
+ 2,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
63
+ 2,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
64
+ 2,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
65
+ 2,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
66
+ 2,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
67
+ 2,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
68
+ 2,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
69
+ 2,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
70
+ 3,SisterCities,0.169804,0.033333,0.238431,0.028182,1
71
+ 3,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
72
+ 3,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
73
+ 3,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
74
+ 3,Toolkit,0.830588,0.07303,0.126667,0.025152,5
75
+ 3,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
76
+ 3,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
77
+ 3,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
78
+ 3,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
79
+ 3,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
80
+ 3,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
81
+ 3,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
82
+ 3,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
83
+ 3,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
84
+ 3,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
85
+ 3,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
86
+ 3,Friendship City,0.118039,0.372121,0.127059,0.013939,17
87
+ 3,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
88
+ 3,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
89
+ 3,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
90
+ 3,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
91
+ 3,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
92
+ 3,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
93
+ 3,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
94
+ 3,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
95
+ 3,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
96
+ 3,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
97
+ 3,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
98
+ 3,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
99
+ 3,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
100
+ 3,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
101
+ 3,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
102
+ 3,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
103
+ 3,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
104
+ 3,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
105
+ 3,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
106
+ 3,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
107
+ 3,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
108
+ 3,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
109
+ 3,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
110
+ 3,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
111
+ 3,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
112
+ 3,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
113
+ 3,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
114
+ 3,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
115
+ 4,SisterCities,0.169804,0.033333,0.238824,0.028182,1
116
+ 4,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
117
+ 4,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
118
+ 4,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
119
+ 4,Toolkit,0.83098,0.072727,0.127059,0.025455,5
120
+ 4,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
121
+ 4,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
122
+ 4,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
123
+ 4,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
124
+ 4,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
125
+ 4,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
126
+ 4,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
127
+ 4,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
128
+ 4,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
129
+ 4,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
130
+ 4,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
131
+ 4,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
132
+ 4,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
133
+ 4,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
134
+ 4,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
135
+ 4,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
136
+ 4,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
137
+ 4,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
138
+ 4,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
139
+ 4,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
140
+ 4,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
141
+ 4,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
142
+ 4,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
143
+ 4,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
144
+ 4,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
145
+ 4,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
146
+ 4,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
147
+ 4,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
148
+ 4,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
149
+ 4,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
150
+ 4,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
151
+ 4,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
152
+ 4,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
153
+ 4,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
154
+ 4,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
155
+ 4,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
156
+ 4,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
157
+ 4,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
158
+ 4,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
159
+ 4,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
160
+ 4,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
161
+ 5,SisterCities,0.169804,0.033333,0.239216,0.028182,1
162
+ 5,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
163
+ 5,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
164
+ 5,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
165
+ 5,Toolkit,0.83098,0.07303,0.126667,0.025152,5
166
+ 5,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
167
+ 5,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
168
+ 5,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
169
+ 5,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
170
+ 5,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
171
+ 5,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
172
+ 5,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
173
+ 5,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
174
+ 5,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
175
+ 5,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
176
+ 5,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
177
+ 5,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
178
+ 5,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
179
+ 5,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
180
+ 5,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
181
+ 5,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
182
+ 5,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
183
+ 5,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
184
+ 5,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
185
+ 5,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
186
+ 5,for their records.,0.176078,0.550606,0.131373,0.010606,26
187
+ 5,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
188
+ 5,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
189
+ 5,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
190
+ 5,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
191
+ 5,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
192
+ 5,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
193
+ 5,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
194
+ 5,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
195
+ 5,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
196
+ 5,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
197
+ 5,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
198
+ 5,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
199
+ 5,347-8630.,0.117647,0.799394,0.080392,0.010303,39
200
+ 6,SisterCities,0.169412,0.033333,0.239608,0.028485,1
201
+ 6,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
202
+ 6,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
203
+ 6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
204
+ 6,Toolkit,0.830588,0.072727,0.127843,0.025758,5
205
+ 6,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
206
+ 6,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
207
+ 6,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
208
+ 6,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
209
+ 6,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
210
+ 6,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
211
+ 6,AND,0.487843,0.452727,0.048235,0.011212,12
212
+ 6,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
213
+ 6,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
214
+ 6,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
215
+ 6,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
216
+ 6,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
217
+ 6,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
218
+ 6,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
219
+ 6,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
220
+ 6,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
221
+ 6,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
222
+ 6,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
223
+ 6,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
224
+ 6,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
225
+ 6,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
226
+ 6,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
227
+ 6,A,0.344314,0.768485,0.084706,0.030303,28
228
+ 6,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
229
+ 6,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
230
+ 6,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
231
+ 6,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
232
+ 6,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
233
+ 7,SisterCities,0.169412,0.033333,0.239608,0.028485,1
234
+ 7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
235
+ 7,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
236
+ 7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
237
+ 7,Toolkit,0.83098,0.072727,0.127059,0.025758,5
238
+ 7,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
239
+ 7,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
240
+ 7,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
241
+ 7,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
242
+ 7,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
243
+ 7,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
244
+ 7,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
245
+ 7,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
246
+ 7,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
247
+ 7,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
248
+ 7,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
249
+ 7,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
250
+ 7,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
251
+ 7,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
252
+ 7,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
253
+ 7,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
254
+ 7,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
255
+ 7,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
256
+ 7,the cities;,0.22902,0.624545,0.076471,0.012424,24
257
+ 7,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
258
+ 7,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
259
+ 7,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
260
+ 7,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
261
+ 7,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
262
+ 7,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
263
+ 7,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
264
+ 7,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
265
+ 7,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
266
+ 7,Mayor,0.311373,0.894848,0.053333,0.012727,34
267
+ 7,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
268
+ 7,New York City,0.287843,0.909091,0.121176,0.013333,36
269
+ 7,London,0.701961,0.909091,0.061569,0.010606,37
270
+ 8,SisterCities,0.169412,0.03303,0.24,0.028182,1
271
+ 8,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
272
+ 8,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
273
+ 8,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
274
+ 8,Toolkit,0.83098,0.072727,0.127451,0.025758,5
275
+ 8,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
276
+ 8,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
277
+ 8,California,0.551373,0.257273,0.136471,0.033333,8
278
+ 8,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
279
+ 8,between the,0.464706,0.352727,0.084314,0.009697,10
280
+ 8,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
281
+ 8,"California, USA",0.4,0.397576,0.21098,0.016061,12
282
+ 8,and the,0.48,0.415152,0.053333,0.009091,13
283
+ 8,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
284
+ 8,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
285
+ 8,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
286
+ 8,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
287
+ 8,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
288
+ 8,purposes:,0.216863,0.516061,0.058039,0.009394,19
289
+ 8,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
290
+ 8,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
291
+ 8,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
292
+ 8,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
293
+ 8,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
294
+ 8,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
295
+ 8,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
296
+ 8,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
297
+ 8,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
298
+ 8,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
299
+ 8,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
300
+ 8,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
301
+ 8,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
302
+ 8,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
303
+ 8,"California, USA",0.582745,0.765758,0.125098,0.01303,34
304
+ 8,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
305
+ 8,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
306
+ 8,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
307
+ 8,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
308
+ 8,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
309
+ 9,SisterCities,0.169412,0.03303,0.24,0.028485,1
310
+ 9,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
311
+ 9,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
312
+ 9,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
313
+ 9,Toolkit,0.83098,0.072727,0.127451,0.025758,5
314
+ 9,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
315
+ 9,adopted by,0.2,0.213333,0.080392,0.013636,7
316
+ 9,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
317
+ 9,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
318
+ 9,and,0.199608,0.260909,0.026275,0.010606,10
319
+ 9,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
320
+ 9,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
321
+ 9,ON,0.551765,0.298182,0.026667,0.011515,13
322
+ 9,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
323
+ 9,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
324
+ 9,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
325
+ 9,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
326
+ 9,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
327
+ 9,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
328
+ 9,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
329
+ 9,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
330
+ 9,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
331
+ 9,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
332
+ 9,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
333
+ 9,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
334
+ 9,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
335
+ 9,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
336
+ 9,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
337
+ 9,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
338
+ 9,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
339
+ 9,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
340
+ 9,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
341
+ 9,3h.5.,0.593725,0.750606,0.218039,0.06303,33
342
+ 9,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
343
+ 9,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
344
+ 9,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
345
+ 9,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
346
+ 10,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
347
+ 10,agcas,0.726169,0.191722,0.053368,0.011749,2
348
+ 10,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
349
+ 10,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
350
+ 10,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
351
+ 10,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
352
+ 10,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
353
+ 10,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
354
+ 10,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
355
+ 10,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
356
+ 10,Personal Details,0.221568,0.299332,0.095326,0.007744,11
357
+ 10,Summary,0.220832,0.321495,0.048215,0.008278,12
358
+ 10,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
359
+ 10,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
360
+ 10,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
361
+ 10,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
362
+ 10,2008 present,0.220832,0.401602,0.074715,0.008011,17
363
+ 10,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
364
+ 10,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
365
+ 10,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
366
+ 10,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
367
+ 10,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
368
+ 10,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
369
+ 10,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
370
+ 10,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
371
+ 10,dissertation.,0.230033,0.448064,0.057784,0.006676,26
372
+ 10,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
373
+ 10,2000 2007,0.2212,0.467824,0.061833,0.006409,28
374
+ 10,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
375
+ 10,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
376
+ 10,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
377
+ 10,Work History,0.220832,0.509212,0.065513,0.008278,32
378
+ 10,2008 2011,0.220832,0.529506,0.061833,0.006409,33
379
+ 10,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
380
+ 10,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
381
+ 10,Briefly list,0.707766,0.536716,0.045639,0.008011,36
382
+ 10,your relevant,0.70703,0.546061,0.061465,0.008011,37
383
+ 10,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
384
+ 10,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
385
+ 10,duties.,0.707398,0.555674,0.030916,0.006409,40
386
+ 10,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
387
+ 10,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
388
+ 10,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
389
+ 10,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
390
+ 10,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
391
+ 10,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
392
+ 10,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
393
+ 10,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
394
+ 10,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
395
+ 10,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
396
+ 10,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
397
+ 10,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
398
+ 10,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
399
+ 10,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
400
+ 10,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
401
+ 10,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
402
+ 10,2007/2011,0.220832,0.728438,0.055208,0.008011,57
403
+ 10,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
404
+ 10,Interests,0.2212,0.748465,0.043062,0.006676,59
405
+ 10,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
406
+ 10,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
407
+ 11,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
408
+ 11,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
409
+ 11,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
410
+ 11,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
411
+ 11,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
412
+ 11,Effective communication,0.2212,0.265421,0.123298,0.006676,6
413
+ 11,require.,0.468531,0.263017,0.034965,0.008011,7
414
+ 11,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
415
+ 11,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
416
+ 11,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
417
+ 11,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
418
+ 11,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
419
+ 11,Customer service,0.220832,0.335915,0.085388,0.006676,13
420
+ 11,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
421
+ 11,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
422
+ 11,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
423
+ 11,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
424
+ 11,the globe.,0.233714,0.385848,0.049687,0.008278,18
425
+ 11,Teamwork,0.220464,0.406142,0.052632,0.006409,19
426
+ 11,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
427
+ 11,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
428
+ 11,promotion.,0.234082,0.436048,0.05484,0.008545,22
429
+ 11,Administration,0.220464,0.456075,0.075083,0.006409,23
430
+ 11,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
431
+ 11,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
432
+ 11,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
433
+ 11,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
434
+ 11,where you performed them,0.63894,0.472363,0.128082,0.008278,28
435
+ 11,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
436
+ 11,and how you performed,0.639308,0.481709,0.111888,0.008278,30
437
+ 11,them well.,0.63894,0.491055,0.048951,0.006409,31
438
+ 11,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
439
+ 11,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
440
+ 11,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
441
+ 11,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
442
+ 11,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
443
+ 11,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
444
+ 11,Initiative,0.2212,0.576235,0.044166,0.006676,38
445
+ 11,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
446
+ 11,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
447
+ 11,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
448
+ 11,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
449
+ 11,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
450
+ 11,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
451
+ 11,service industries.,0.234082,0.656609,0.088333,0.006943,45
452
+ 11,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
453
+ 11,customers.,0.234082,0.67717,0.054472,0.006142,47
454
+ 11,Language ability,0.2212,0.696395,0.082444,0.008812,48
455
+ 11,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
456
+ 11,Referees,0.2212,0.726569,0.041958,0.006676,50
457
+ 11,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
458
+ 11,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
459
+ 11,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
460
+ 11,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
461
+ 11,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
462
+ 11,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
463
+ 12,5-Point Networking Email,0.404314,0.050606,0.189804,0.012121,1
464
+ 12,"Steve Dalton, the author of The 2-Hour Job Search believes the perfect networking email is a ""5-Point E-mail"". The five",0.058824,0.086061,0.859608,0.012727,2
465
+ 12,points are as follows:,0.059216,0.10303,0.152941,0.012727,3
466
+ 12,1. 100 words or less,0.088627,0.136667,0.156078,0.010303,4
467
+ 12,2. No mention of jobs (in subject or body),0.088235,0.153333,0.31451,0.012727,5
468
+ 12,"3. Connection goes first (e.g., ND connection)",0.087843,0.170606,0.341569,0.01303,6
469
+ 12,4. Generalize your interest,0.087843,0.187879,0.205098,0.012424,7
470
+ 12,5. Maintain control of the follow up,0.088627,0.204545,0.27098,0.012727,8
471
+ 12,Here's an example of what a 5-Point email would look like:,0.059608,0.255455,0.42549,0.012727,9
472
+ 12,Subject: Notre Dame MBA Student Seeking Your Advice,0.117255,0.289394,0.414118,0.012424,10
473
+ 12,"Dear Mr. Jones,",0.118039,0.323939,0.112549,0.011515,11
474
+ 12,"My name is Brooke Franklin, and I'm a first-year Notre Dame MBA student who found your",0.118431,0.35697,0.661569,0.01303,12
475
+ 12,information in the ND alumni database. May I have 15 minutes of your time to ask you about,0.118039,0.374242,0.677255,0.012727,13
476
+ 12,your experience with IBM? I'm trying to learn more about marketing careers at technology,0.117255,0.391212,0.660784,0.01303,14
477
+ 12,companies and your insights would be very helpful.,0.117647,0.407879,0.373333,0.01303,15
478
+ 12,"I realize this may be a busy time for you, so if we're unable to connect this week, I'll try again",0.118039,0.442121,0.674902,0.012727,16
479
+ 12,next week to see whether that is more convenient.,0.118039,0.459091,0.370588,0.010303,17
480
+ 12,"Thank you for your time,",0.117255,0.492727,0.179216,0.012727,18
481
+ 12,Brooke,0.118431,0.51,0.050588,0.01,19
482
+ 12,The most important part of this email may be the follow-up; an email like this allows you to reach out again in a week if,0.058431,0.543333,0.872157,0.01303,20
483
+ 12,you haven't heard back without feeling like you're bothering the person at the other end. If you don't hear anything,0.058431,0.560606,0.843922,0.01303,21
484
+ 12,"after the second attempt, you can probably cross him/her off your list and move on to the next contact.",0.058824,0.577273,0.755686,0.01303,22
485
+ 13,36 Westmoreland Drive,0.705764,0.026796,0.209996,0.011403,1
486
+ 13,Newcastle upon Tyne,0.723499,0.04333,0.192664,0.013968,2
487
+ 13,NE1 8LT,0.836759,0.059863,0.079807,0.011117,3
488
+ 13,Mr Mark Wilson,0.083837,0.076112,0.138251,0.011403,4
489
+ 13,UK Health Trust,0.083837,0.09236,0.143087,0.011403,5
490
+ 13,18 Whitehall Square,0.084643,0.108609,0.179766,0.013968,6
491
+ 13,London,0.083837,0.125428,0.066102,0.011117,7
492
+ 13,SW1 9LT,0.083837,0.141391,0.083031,0.011403,8
493
+ 13,11th January 2015,0.755744,0.154789,0.161225,0.017389,9
494
+ 13,Dear Mr Wilson,0.083837,0.174173,0.137042,0.011403,10
495
+ 13,Re: Community Health Development Officer [HD/12/2014],0.083837,0.201539,0.544135,0.014253,11
496
+ 13,"I am writing to apply for the above post, as advertised on the Health UK recruitment site. I am",0.08424,0.228905,0.828295,0.014253,12
497
+ 13,a sociology graduate with a 2: 1from Newcastle University. I have relevant health awareness,0.083434,0.245439,0.822249,0.014253,13
498
+ 13,"experience, and I am looking for a position where I can employ my knowledge and skills in",0.083434,0.261973,0.802499,0.013968,14
499
+ 13,support of health and community development. I enclose my CV for your attention.,0.083434,0.277936,0.731963,0.014253,15
500
+ 13,I am eager to work for UK Health Trust because of your ground-breaking work within the field,0.08424,0.305302,0.825877,0.014253,16
501
+ 13,of community health. I became aware of the work of the Trust when carrying out my,0.083434,0.322121,0.744055,0.013968,17
502
+ 13,"dissertation, 'Generational Change in Local Health Awareness, where I researched health",0.083031,0.338084,0.798468,0.014253,18
503
+ 13,awareness of children and elderly people in a deprived location. I referred to a number of,0.083031,0.354618,0.792019,0.013968,19
504
+ 13,publications produced by UK Health Trust and was impressed by the innovative techniques,0.083837,0.371152,0.809351,0.013968,20
505
+ 13,your organisation uses to engage local community members in projects. The Community,0.083031,0.387685,0.788795,0.014253,21
506
+ 13,Health Development Officer position would further develop my existing abilities and my,0.08424,0.403934,0.771463,0.014253,22
507
+ 13,"understanding of community development, allowing me to contribute in a practical way to",0.083837,0.420468,0.789601,0.013968,23
508
+ 13,enhancing the health of disadvantaged people.,0.083434,0.436716,0.415961,0.013968,24
509
+ 13,The volunteer development aspect of the position particularly appeals to me. I have worked,0.083031,0.469213,0.811769,0.014538,25
510
+ 13,"in the voluntary sector, providing services tackling health inequalities and promoting healthy",0.083837,0.485747,0.814994,0.014253,26
511
+ 13,living in Newcastle. I promoted health awareness through one to one sessions and in large,0.083434,0.501995,0.805723,0.014253,27
512
+ 13,"groups and developed interpersonal skills, confidence and patience when engaging and",0.083031,0.518529,0.787183,0.014253,28
513
+ 13,"motivating participants. While raising the group's profile using social media, the local press",0.083434,0.534778,0.804917,0.013968,29
514
+ 13,"and at presentations to youth clubs, faith meetings and care homes I recognised the need to",0.083434,0.551596,0.820637,0.013968,30
515
+ 13,"change my delivery style to suit the audience. As a volunteer teacher in Ghana, I developed",0.083434,0.56756,0.8158,0.014253,31
516
+ 13,communication and team-building skills essential to your advertised role; liaising with,0.083434,0.584094,0.753325,0.013968,32
517
+ 13,colleagues and parents and a lively group of twenty-five 7-8 year olds to arrange a,0.083434,0.600627,0.731963,0.014253,33
518
+ 13,"community event. My retail experience, coupled with my extracurricular activities additionally",0.083434,0.617161,0.822249,0.013968,34
519
+ 13,"enhanced my ability to develop others, as I was responsible for inducting and training my",0.083434,0.633409,0.79081,0.014253,35
520
+ 13,peers.,0.083837,0.652509,0.05401,0.011117,36
521
+ 13,"In relation to the fundraising and budgeting aspect of the role, I have experience of raising",0.08424,0.68244,0.798065,0.014253,37
522
+ 13,"substantial amounts of money through several successful charity events, including a well -",0.083031,0.698404,0.802096,0.014538,38
523
+ 13,attended fashion show. I was also elected Treasurer of NU Sociology Society with,0.083434,0.715222,0.728335,0.014253,39
524
+ 13,responsibility for managing a budget of £3000.,0.083434,0.731471,0.411528,0.014538,40
525
+ 13,The necessity to travel to identify community issues only adds to the appeal of the position. I,0.083031,0.758837,0.82104,0.014253,41
526
+ 13,"enjoy driving, hold a full clean driving licence and I am very interested in relocating to London",0.083434,0.775086,0.828295,0.014538,42
527
+ 13,to work for UK Health Trust.,0.083031,0.791619,0.247481,0.011688,43
528
+ 13,Thank you for considering my application. I look forward to hearing from you.,0.083434,0.824401,0.68158,0.014253,44
529
+ 13,Yours sincerely,0.082628,0.857184,0.138251,0.014253,45
530
+ 13,Rachel Sullivan,0.083837,0.889966,0.137042,0.011403,46
531
+ 14,SisterCities,0.169804,0.033333,0.238431,0.028182,1
532
+ 14,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
533
+ 14,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
534
+ 14,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303,4
535
+ 14,Toolkit,0.830588,0.07303,0.126667,0.025152,5
536
+ 14,Types of Affiliations,0.117255,0.157576,0.241961,0.02,6
537
+ 14,Sister City Relationship,0.117647,0.187273,0.196863,0.013939,7
538
+ 14,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636,8
539
+ 14,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939,9
540
+ 14,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636,10
541
+ 14,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939,11
542
+ 14,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636,12
543
+ 14,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636,13
544
+ 14,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636,14
545
+ 14,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636,15
546
+ 14,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636,16
547
+ 14,Friendship City,0.118039,0.372121,0.127059,0.013939,17
548
+ 14,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636,18
549
+ 14,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242,19
550
+ 14,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636,20
551
+ 14,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636,21
552
+ 14,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636,22
553
+ 14,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333,23
554
+ 14,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636,24
555
+ 14,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636,25
556
+ 14,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636,26
557
+ 14,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333,27
558
+ 14,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727,28
559
+ 14,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636,29
560
+ 14,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333,30
561
+ 14,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939,31
562
+ 14,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242,32
563
+ 14,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636,33
564
+ 14,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636,34
565
+ 14,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939,35
566
+ 14,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939,36
567
+ 14,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636,37
568
+ 14,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939,38
569
+ 14,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242,39
570
+ 14,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333,40
571
+ 14,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939,41
572
+ 14,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636,42
573
+ 14,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333,43
574
+ 14,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636,44
575
+ 14,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636,45
576
+ 15,SisterCities,0.169804,0.033333,0.238824,0.028182,1
577
+ 15,Partnership Agreement,0.516078,0.027879,0.440784,0.032424,2
578
+ 15,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788,3
579
+ 15,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
580
+ 15,Toolkit,0.83098,0.072727,0.127059,0.025455,5
581
+ 15,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333,6
582
+ 15,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333,7
583
+ 15,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333,8
584
+ 15,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636,9
585
+ 15,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333,10
586
+ 15,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333,11
587
+ 15,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333,12
588
+ 15,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333,13
589
+ 15,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636,14
590
+ 15,General Guidelines,0.118039,0.295152,0.231765,0.016061,15
591
+ 15,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636,16
592
+ 15,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636,17
593
+ 15,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636,18
594
+ 15,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636,19
595
+ 15,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636,20
596
+ 15,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333,21
597
+ 15,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636,22
598
+ 15,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636,23
599
+ 15,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939,24
600
+ 15,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636,25
601
+ 15,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636,26
602
+ 15,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636,27
603
+ 15,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939,28
604
+ 15,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333,29
605
+ 15,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636,30
606
+ 15,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636,31
607
+ 15,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939,32
608
+ 15,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636,33
609
+ 15,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939,34
610
+ 15,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636,35
611
+ 15,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333,36
612
+ 15,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636,37
613
+ 15,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636,38
614
+ 15,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636,39
615
+ 15,with very specific tasks.,0.176078,0.750606,0.190196,0.013333,40
616
+ 15,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636,41
617
+ 15,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333,42
618
+ 15,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939,43
619
+ 15,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636,44
620
+ 15,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939,45
621
+ 15,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636,46
622
+ 16,SisterCities,0.169804,0.033333,0.239216,0.028182,1
623
+ 16,Partnership Agreement,0.516078,0.027879,0.441176,0.032121,2
624
+ 16,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788,3
625
+ 16,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303,4
626
+ 16,Toolkit,0.83098,0.07303,0.126667,0.025152,5
627
+ 16,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333,6
628
+ 16,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333,7
629
+ 16,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636,8
630
+ 16,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636,9
631
+ 16,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636,10
632
+ 16,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333,11
633
+ 16,and cooperation.,0.176471,0.25697,0.13451,0.013333,12
634
+ 16,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333,13
635
+ 16,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636,14
636
+ 16,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333,15
637
+ 16,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636,16
638
+ 16,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636,17
639
+ 16,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636,18
640
+ 16,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636,19
641
+ 16,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333,20
642
+ 16,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636,21
643
+ 16,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333,22
644
+ 16,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636,23
645
+ 16,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636,24
646
+ 16,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636,25
647
+ 16,for their records.,0.176078,0.550606,0.131373,0.010606,26
648
+ 16,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636,27
649
+ 16,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636,28
650
+ 16,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333,29
651
+ 16,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939,30
652
+ 16,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636,31
653
+ 16,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242,32
654
+ 16,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515,33
655
+ 16,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939,34
656
+ 16,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636,35
657
+ 16,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636,36
658
+ 16,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636,37
659
+ 16,sending it to our Membership Director at akaplan@sister-cities.org or contacting us at (202),0.117647,0.782727,0.732157,0.013636,38
660
+ 16,347-8630.,0.117647,0.799394,0.080392,0.010303,39
661
+ 17,SisterCities,0.169412,0.033333,0.239608,0.028485,1
662
+ 17,Partnership Agreement,0.516471,0.027879,0.440784,0.032727,2
663
+ 17,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091,3
664
+ 17,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
665
+ 17,Toolkit,0.830588,0.072727,0.127843,0.025758,5
666
+ 17,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394,6
667
+ 17,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667,7
668
+ 17,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727,8
669
+ 17,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121,9
670
+ 17,BETWEEN,0.454902,0.413636,0.110588,0.011212,10
671
+ 17,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939,11
672
+ 17,AND,0.487843,0.452727,0.048235,0.011212,12
673
+ 17,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848,13
674
+ 17,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303,14
675
+ 17,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727,15
676
+ 17,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727,16
677
+ 17,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424,17
678
+ 17,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424,18
679
+ 17,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303,19
680
+ 17,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121,20
681
+ 17,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424,21
682
+ 17,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121,22
683
+ 17,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758,23
684
+ 17,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303,24
685
+ 17,relationship became effective.,0.221569,0.705152,0.217647,0.012424,25
686
+ 17,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303,26
687
+ 17,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727,27
688
+ 17,A,0.344314,0.768485,0.084706,0.030303,28
689
+ 17,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909,29
690
+ 17,Lee P.Brown,0.729412,0.806364,0.118824,0.010303,30
691
+ 17,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727,31
692
+ 17,Mayor of Houston,0.704706,0.823333,0.166667,0.012424,32
693
+ 17,&Town Planning,0.324314,0.841212,0.155686,0.012424,33
694
+ 18,SisterCities,0.169412,0.033333,0.239608,0.028485,1
695
+ 18,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
696
+ 18,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091,3
697
+ 18,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
698
+ 18,Toolkit,0.83098,0.072727,0.127059,0.025758,5
699
+ 18,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697,6
700
+ 18,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697,7
701
+ 18,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303,8
702
+ 18,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818,9
703
+ 18,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333,10
704
+ 18,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242,11
705
+ 18,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636,12
706
+ 18,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152,13
707
+ 18,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455,14
708
+ 18,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152,15
709
+ 18,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152,16
710
+ 18,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455,17
711
+ 18,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848,18
712
+ 18,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939,19
713
+ 18,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545,20
714
+ 18,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848,21
715
+ 18,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152,22
716
+ 18,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242,23
717
+ 18,the cities;,0.22902,0.624545,0.076471,0.012424,24
718
+ 18,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152,25
719
+ 18,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636,26
720
+ 18,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636,27
721
+ 18,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848,28
722
+ 18,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545,29
723
+ 18,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152,30
724
+ 18,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545,31
725
+ 18,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636,32
726
+ 18,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909,33
727
+ 18,Mayor,0.311373,0.894848,0.053333,0.012727,34
728
+ 18,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091,35
729
+ 18,New York City,0.287843,0.909091,0.121176,0.013333,36
730
+ 18,London,0.701961,0.909091,0.061569,0.010606,37
731
+ 19,SisterCities,0.169412,0.03303,0.24,0.028182,1
732
+ 19,Partnership Agreement,0.515686,0.027576,0.441961,0.03303,2
733
+ 19,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091,3
734
+ 19,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333,4
735
+ 19,Toolkit,0.83098,0.072727,0.127451,0.025758,5
736
+ 19,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364,6
737
+ 19,City of Long Beach,0.388627,0.196667,0.476471,0.066364,7
738
+ 19,California,0.551373,0.257273,0.136471,0.033333,8
739
+ 19,Sister City Agreement,0.321961,0.305455,0.378431,0.035152,9
740
+ 19,between the,0.464706,0.352727,0.084314,0.009697,10
741
+ 19,City of Long Beach,0.38,0.378485,0.252549,0.01697,11
742
+ 19,"California, USA",0.4,0.397576,0.21098,0.016061,12
743
+ 19,and the,0.48,0.415152,0.053333,0.009091,13
744
+ 19,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697,14
745
+ 19,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152,15
746
+ 19,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121,16
747
+ 19,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303,17
748
+ 19,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121,18
749
+ 19,purposes:,0.216863,0.516061,0.058039,0.009394,19
750
+ 19,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424,20
751
+ 19,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424,21
752
+ 19,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424,22
753
+ 19,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121,23
754
+ 19,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121,24
755
+ 19,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121,25
756
+ 19,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727,26
757
+ 19,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697,27
758
+ 19,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727,28
759
+ 19,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424,29
760
+ 19,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121,30
761
+ 19,STATE OFFICE,0.276471,0.713636,0.050588,0.048788,31
762
+ 19,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636,32
763
+ 19,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636,33
764
+ 19,"California, USA",0.582745,0.765758,0.125098,0.01303,34
765
+ 19,10.2aulus,0.490588,0.771818,0.220392,0.062424,35
766
+ 19,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333,36
767
+ 19,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636,37
768
+ 19,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818,38
769
+ 19,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303,39
770
+ 20,SisterCities,0.169412,0.03303,0.24,0.028485,1
771
+ 20,Partnership Agreement,0.516078,0.027879,0.441176,0.032424,2
772
+ 20,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091,3
773
+ 20,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333,4
774
+ 20,Toolkit,0.83098,0.072727,0.127451,0.025758,5
775
+ 20,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939,6
776
+ 20,adopted by,0.2,0.213333,0.080392,0.013636,7
777
+ 20,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424,8
778
+ 20,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515,9
779
+ 20,and,0.199608,0.260909,0.026275,0.010606,10
780
+ 20,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212,11
781
+ 20,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212,12
782
+ 20,ON,0.551765,0.298182,0.026667,0.011515,13
783
+ 20,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848,14
784
+ 20,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152,15
785
+ 20,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455,16
786
+ 20,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848,17
787
+ 20,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242,18
788
+ 20,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848,19
789
+ 20,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242,20
790
+ 20,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152,21
791
+ 20,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242,22
792
+ 20,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242,23
793
+ 20,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455,24
794
+ 20,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636,25
795
+ 20,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455,26
796
+ 20,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606,27
797
+ 20,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758,28
798
+ 20,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242,29
799
+ 20,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848,30
800
+ 20,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152,31
801
+ 20,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636,32
802
+ 20,3h.5.,0.593725,0.750606,0.218039,0.06303,33
803
+ 20,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606,34
804
+ 20,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818,35
805
+ 20,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606,36
806
+ 20,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303,37
807
+ 21,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308796,0.008545,1
808
+ 21,agcas,0.726169,0.191722,0.053368,0.011749,2
809
+ 21,Example of a skills-based CV,0.3894,0.205874,0.224144,0.011482,3
810
+ 21,ASHLEY GILL,0.459698,0.246195,0.082812,0.008278,4
811
+ 21,3 Lappage Court,0.2212,0.259012,0.080972,0.008545,5
812
+ 21,Telephone: 01882 652349,0.592565,0.259012,0.129555,0.008278,6
813
+ 21,"Tyler Green, Bucks.",0.220464,0.269159,0.092381,0.008278,7
814
+ 21,Mobile: 07717 121824,0.593669,0.269159,0.112992,0.006676,8
815
+ 21,HP8 4JD,0.2212,0.279306,0.040486,0.006409,9
816
+ 21,Email: ashleygill2023@gotmail.com,0.594038,0.279039,0.178874,0.008545,10
817
+ 21,Personal Details,0.221568,0.299332,0.095326,0.007744,11
818
+ 21,Summary,0.220832,0.321495,0.048215,0.008278,12
819
+ 21,Business studies with Spanish undergraduate.,0.273463,0.340988,0.229297,0.008812,13
820
+ 21,Ability to speak French and Spanish.,0.272727,0.351135,0.179242,0.008545,14
821
+ 21,Extensive business experience including an internship with Top Choice Holidays.,0.273095,0.361015,0.398233,0.008812,15
822
+ 21,Education And Qualifications,0.2212,0.381041,0.144277,0.008278,16
823
+ 21,2008 present,0.220832,0.401602,0.074715,0.008011,17
824
+ 21,Buckinghamshire Edge University,0.386824,0.401068,0.167096,0.008545,18
825
+ 21,BA International Business Studies with Spanish (expected 2:1),0.386824,0.410681,0.308796,0.008812,19
826
+ 21,Relate your degree to,0.230033,0.420027,0.100847,0.008278,20
827
+ 21,Study semester at The University of Valloid (Spain).,0.399338,0.420828,0.252852,0.008812,21
828
+ 21,the job by listing your,0.229665,0.429105,0.101583,0.008278,22
829
+ 21,Six-month work placement in Madrid.,0.399338,0.431242,0.188811,0.008545,23
830
+ 21,relevant modules/,0.230033,0.438718,0.085388,0.007744,24
831
+ 21,Relevant modules included: Business Planning; Sales Promotion and,0.399338,0.441389,0.338241,0.008545,25
832
+ 21,dissertation.,0.230033,0.448064,0.057784,0.006676,26
833
+ 21,Marketing; and Business Operations Management.,0.398969,0.451268,0.25322,0.008812,27
834
+ 21,2000 2007,0.2212,0.467824,0.061833,0.006409,28
835
+ 21,Freebridge School,0.386824,0.46729,0.087965,0.008545,29
836
+ 21,"A-Levels: Business Studies (B), French (C)",0.386088,0.476903,0.200221,0.008812,30
837
+ 21,"8 GCSEs including Maths, English, Spanish and French",0.386824,0.487583,0.266838,0.008545,31
838
+ 21,Work History,0.220832,0.509212,0.065513,0.008278,32
839
+ 21,2008 2011,0.220832,0.529506,0.061833,0.006409,33
840
+ 21,Buckinghamshire Edge University Librarian/tour guide,0.386824,0.528972,0.277144,0.008812,34
841
+ 21,General administrative and customer service roles.,0.399338,0.539119,0.25138,0.006676,35
842
+ 21,Briefly list,0.707766,0.536716,0.045639,0.008011,36
843
+ 21,your relevant,0.70703,0.546061,0.061465,0.008011,37
844
+ 21,2011 (Feb-Aug),0.2212,0.55514,0.078027,0.008812,38
845
+ 21,Audigest S.A. (Madrid) - Audit Assistant,0.386456,0.554873,0.199485,0.009079,39
846
+ 21,duties.,0.707398,0.555674,0.030916,0.006409,40
847
+ 21,Six months' work experience in an international bank.,0.399338,0.565287,0.267575,0.008545,41
848
+ 21,Liaising with colleagues and clients in English and Spanish.,0.399338,0.575434,0.292602,0.008545,42
849
+ 21,2010 (June-Dec),0.220832,0.591188,0.082444,0.008278,43
850
+ 21,Finsbury's supermarket (Hazelbridge) — Supervisor,0.386824,0.591188,0.250644,0.008812,44
851
+ 21,Managing a small team.,0.398969,0.601602,0.121089,0.008545,45
852
+ 21,Customer service in a busy competitive environment.,0.398969,0.611215,0.264262,0.008545,46
853
+ 21,2010 (Jan-Aug),0.2212,0.627236,0.077291,0.008812,47
854
+ 21,Top Choice Holidays and Flights Ltd (Low Wycombe),0.386088,0.627503,0.257637,0.008812,48
855
+ 21,Financial Assistant/Supervisor,0.386824,0.637383,0.15127,0.008812,49
856
+ 21,Working in a range of teams to manage complex financial processes.,0.398969,0.64753,0.341921,0.008812,50
857
+ 21,2007 (Jul-Aug),0.220832,0.663284,0.074347,0.008812,51
858
+ 21,Dogs Protection League - General Assistant,0.386824,0.663818,0.216783,0.008812,52
859
+ 21,Dealing with enquiries and selling packages to a range of clients.,0.399706,0.673431,0.321678,0.009079,53
860
+ 21,2006 (Jan-Dec),0.220832,0.689453,0.076187,0.009079,54
861
+ 21,McHenry's Restaurant (Low Wycombe) - Supervisor,0.386456,0.68972,0.256533,0.009079,55
862
+ 21,Voluntary Experience,0.220464,0.708411,0.106367,0.008545,56
863
+ 21,2007/2011,0.220832,0.728438,0.055208,0.008011,57
864
+ 21,Teaching English in Mexico/Spain,0.386088,0.727904,0.167832,0.009079,58
865
+ 21,Interests,0.2212,0.748465,0.043062,0.006676,59
866
+ 21,Active member of University Business Club — Winner of the 'Bucks Best Business Pitch' award in 2010 Enterprise,0.220464,0.768224,0.556864,0.009079,60
867
+ 21,"week, judged by Michael Eavis.",0.220464,0.778104,0.15311,0.008812,61
868
+ 22,Skills_based_CV.qxd 5/8/11 3:55 pm Page,0.17777,0.135381,0.308428,0.008545,1
869
+ 22,Make sure you carefully assess,0.468531,0.23498,0.142068,0.008011,2
870
+ 22,Skills And Achievements,0.220832,0.245394,0.121457,0.006676,3
871
+ 22,the job advert/job description,0.468163,0.244326,0.139124,0.008278,4
872
+ 22,and address all the skills they,0.468531,0.253672,0.13618,0.008278,5
873
+ 22,Effective communication,0.2212,0.265421,0.123298,0.006676,6
874
+ 22,require.,0.468531,0.263017,0.034965,0.008011,7
875
+ 22,"Able to communicate effectively with a wide range of clients and colleagues, by showing interest, carefully",0.233714,0.275567,0.530364,0.008545,8
876
+ 22,"listening to needs and appropriately adjusting my message, as demonstrated during my time at Finsbury's",0.23445,0.285447,0.528892,0.008812,9
877
+ 22,Supermarket.,0.234082,0.295861,0.066618,0.008278,10
878
+ 22,Strong presentation skills and confidence demonstrated by experience of delivering presentations in different,0.23445,0.305474,0.543614,0.008812,11
879
+ 22,languages to groups of five to fifty.,0.234082,0.315621,0.172617,0.008812,12
880
+ 22,Customer service,0.220832,0.335915,0.085388,0.006676,13
881
+ 22,Ability to quickly build rapport with customers and calmly deal with any problems as shown during my retail,0.233714,0.345527,0.541038,0.008812,14
882
+ 22,experience in high pressure environments.,0.234082,0.355941,0.210526,0.008278,15
883
+ 22,"Capacity to maintain professional relationships through email and other written correspondence, for example,",0.234082,0.365554,0.548767,0.008812,16
884
+ 22,"at Audigest in Madrid, where I built longstanding business relationships with customers and colleagues across",0.233714,0.375701,0.549871,0.008812,17
885
+ 22,the globe.,0.233714,0.385848,0.049687,0.008278,18
886
+ 22,Teamwork,0.220464,0.406142,0.052632,0.006409,19
887
+ 22,"At Top Choice Holidays demonstrated excellent teamwork skills in a busy financial environment, such as an",0.233346,0.415754,0.532573,0.008812,20
888
+ 22,"ability to listen to clients and managers, perform my role to a high level and support colleagues, resulting in",0.234082,0.425634,0.535885,0.008812,21
889
+ 22,promotion.,0.234082,0.436048,0.05484,0.008545,22
890
+ 22,Administration,0.220464,0.456075,0.075083,0.006409,23
891
+ 22,Prove you have each of the,0.639676,0.453672,0.123666,0.008278,24
892
+ 22,"Excellent ability to plan ahead and manage time effectively, for example,",0.23445,0.465688,0.360692,0.008812,25
893
+ 22,skills required by outlining,0.63894,0.463017,0.12293,0.008278,26
894
+ 22,managing complex roles during my internship at Top Choice Holidays.,0.23445,0.476101,0.346338,0.008545,27
895
+ 22,where you performed them,0.63894,0.472363,0.128082,0.008278,28
896
+ 22,Gathered data from a wide range of sources during my dissertation,0.234082,0.485714,0.334928,0.008812,29
897
+ 22,and how you performed,0.639308,0.481709,0.111888,0.008278,30
898
+ 22,them well.,0.63894,0.491055,0.048951,0.006409,31
899
+ 22,"whilst balancing my other studies and two jobs, resulting in a 73% grade.",0.233346,0.495861,0.365109,0.008812,32
900
+ 22,Experience of travellers' needs,0.2212,0.515888,0.150534,0.008545,33
901
+ 22,Recent travel consultancy experience gives me an in-depth understanding of the expectations of holiday,0.23445,0.525768,0.518955,0.008812,34
902
+ 22,customers and the competitive nature of the industry.,0.234082,0.535915,0.269047,0.008812,35
903
+ 22,International travel experience and language ability give me an empathy with travellers and a passion for,0.234082,0.545794,0.524107,0.008812,36
904
+ 22,helping them find a unique holiday experience.,0.234082,0.555941,0.23445,0.008812,37
905
+ 22,Initiative,0.2212,0.576235,0.044166,0.006676,38
906
+ 22,Self-funding an evening course in bookkeeping during my first accountancy role demonstrated my ability to,0.234082,0.585848,0.535149,0.008812,39
907
+ 22,plan ahead and take control of my career.,0.23445,0.595995,0.205006,0.008545,40
908
+ 22,Successful study and work in Spain and Mexico show that I can creatively develop my skills and experience and,0.234082,0.605874,0.551711,0.008545,41
909
+ 22,adapt to new and different environments.,0.234082,0.616288,0.208686,0.008278,42
910
+ 22,Sales knowledge,0.220464,0.636315,0.083916,0.008011,43
911
+ 22,Wide experience of financial roles gives me an awareness of the tight monetary pressures which drive UK,0.234082,0.645928,0.525212,0.009346,44
912
+ 22,service industries.,0.234082,0.656609,0.088333,0.006943,45
913
+ 22,Raised sales at The Dogs Protection League by 12% by up selling add-on packages to new and existing,0.23445,0.665955,0.505705,0.009079,46
914
+ 22,customers.,0.234082,0.67717,0.054472,0.006142,47
915
+ 22,Language ability,0.2212,0.696395,0.082444,0.008812,48
916
+ 22,"Spanish fluency obtained working overseas, French semi-fluent.",0.233714,0.706008,0.323151,0.009079,49
917
+ 22,Referees,0.2212,0.726569,0.041958,0.006676,50
918
+ 22,Include all your referee details including their email and,0.351859,0.722029,0.259109,0.008545,51
919
+ 22,phone number (but ask for their permission first).,0.352227,0.731108,0.230401,0.008545,52
920
+ 22,"Professional: Mr. Jose Andreas, Management Accountant, Audigest, Avenida de Concha Espina 2, Madrid, ES-",0.2212,0.746328,0.537725,0.008812,53
921
+ 22,"28036, +34 91 398 5476, j.andreas@audigest.es",0.2212,0.756475,0.238498,0.008278,54
922
+ 22,"Academic: Dr. Jane Luffle, Personal Tutor, Buckinghamshire Edge University, Due Road, Low Wycombe, Bucks,",0.220464,0.776502,0.536621,0.008812,55
923
+ 22,"HD15 3DL, 01628 435 6784, j.luffle@bedge.ac.uk",0.2212,0.786382,0.244755,0.008545,56
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height,line
2
+ 1,Example of emails sent to a professor before applying:,0.147059,0.093434,0.426471,0.013889,1
3
+ 1,Fwd: Prospective Graduate Student,0.145425,0.128788,0.277778,0.013889,2
4
+ 1,"Dr. Kornbluth,",0.147059,0.162879,0.114379,0.012626,3
5
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to the CMB,0.147059,0.198232,0.689542,0.013889,4
6
+ 1,program and am very interested in your work. After glancing at a few of your recent,0.145425,0.214646,0.660131,0.013889,5
7
+ 1,papers and your research summary I find your work with apoptosis very interesting. Will,0.145425,0.232323,0.697712,0.013889,6
8
+ 1,"you be taking on new students next year? If I am invited to interview, is there any way",0.145425,0.25,0.683007,0.013889,7
9
+ 1,you will be able to meet with me?,0.145425,0.267677,0.264706,0.013889,8
10
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.147059,0.30303,0.69281,0.013889,9
11
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.147059,0.320707,0.697712,0.013889,10
12
+ 1,initiate Muller glia division post-light damage. My first research project was,0.147059,0.338384,0.598039,0.013889,11
13
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.147059,0.354798,0.637255,0.013889,12
14
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.372475,0.604575,0.013889,13
15
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.390152,0.689542,0.013889,14
16
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.147059,0.407828,0.635621,0.013889,15
17
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.425505,0.673203,0.013889,16
18
+ 1,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.443182,0.661765,0.013889,17
19
+ 1,transgenic line during retinal development and regeneration.,0.145425,0.459596,0.472222,0.013889,18
20
+ 1,Please find my CV attached.,0.145425,0.496212,0.222222,0.013889,19
21
+ 1,"Thank you for your time,",0.145425,0.531566,0.196078,0.013889,20
22
+ 1,--Lauren Lilley,0.147059,0.566919,0.119281,0.013889,21
23
+ 1,"Dr. Poss,",0.145425,0.637626,0.070261,0.012626,22
24
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to your,0.145425,0.671717,0.655229,0.013889,23
25
+ 1,graduate program and am very interested in your work. After glancing at a few of your,0.145425,0.689394,0.679739,0.013889,24
26
+ 1,recent papers and your research summary I find your research greatly coincides with my,0.145425,0.707071,0.69281,0.013889,25
27
+ 1,research experiences and interests. Will you be taking on new students next year?,0.145425,0.723485,0.643791,0.015152,26
28
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.145425,0.760101,0.69281,0.013889,27
29
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.145425,0.777778,0.699346,0.013889,28
30
+ 1,initiate Muller glia division post-light damage. My first research project was,0.145425,0.795455,0.598039,0.013889,29
31
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.145425,0.811869,0.638889,0.013889,30
32
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.829545,0.604575,0.013889,31
33
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.847222,0.691176,0.013889,32
34
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.145425,0.864899,0.635621,0.013889,33
35
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.881313,0.673203,0.013889,34
36
+ 2,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.093434,0.661765,0.013889,1
37
+ 2,transgenic line during retinal development and regeneration.,0.145425,0.111111,0.472222,0.013889,2
38
+ 2,Please find my CV attached.,0.145425,0.146465,0.222222,0.013889,3
39
+ 2,"Thank you for your time,",0.145425,0.181818,0.196078,0.013889,4
40
+ 2,--Lauren Lilley,0.147059,0.218434,0.119281,0.013889,5
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
2
+ 1,1,Example,0.147059,0.093434,0.215686,0.107323,,,,,
3
+ 1,1,of,0.220588,0.093434,0.240196,0.104798,,,,,
4
+ 1,1,emails,0.24183,0.093434,0.292484,0.104798,,,,,
5
+ 1,1,sent,0.297386,0.094697,0.330065,0.104798,,,,,
6
+ 1,1,to,0.334967,0.094697,0.349673,0.104798,,,,,
7
+ 1,1,a,0.354575,0.097222,0.362745,0.104798,,,,,
8
+ 1,1,professor,0.367647,0.093434,0.441176,0.108586,,,,,
9
+ 1,1,before,0.446078,0.093434,0.496732,0.104798,,,,,
10
+ 1,1,applying:,0.501634,0.093434,0.573529,0.107323,,,,,
11
+ 1,2,Fwd:,0.145425,0.128788,0.184641,0.140152,,,,,
12
+ 1,2,Prospective,0.191176,0.128788,0.28268,0.142677,,,,,
13
+ 1,2,Graduate,0.287582,0.128788,0.359477,0.140152,,,,,
14
+ 1,2,Student,0.364379,0.128788,0.424837,0.140152,,,,,
15
+ 1,3,Dr.,0.147059,0.162879,0.171569,0.174242,,,,,
16
+ 1,3,"Kornbluth,",0.176471,0.162879,0.261438,0.176768,,,,,
17
+ 1,4,I,0.147059,0.198232,0.153595,0.209596,,,,,
18
+ 1,4,am,0.158497,0.200758,0.181373,0.209596,,,,,
19
+ 1,4,a,0.186275,0.20202,0.194444,0.209596,,,,,
20
+ 1,4,senior,0.199346,0.198232,0.248366,0.209596,,,,,
21
+ 1,4,biology,0.253268,0.198232,0.312092,0.212121,,,,,
22
+ 1,4,major,0.316993,0.198232,0.364379,0.212121,,,,,
23
+ 1,4,at,0.367647,0.199495,0.382353,0.209596,,,,,
24
+ 1,4,the,0.387255,0.198232,0.411765,0.209596,,,,,
25
+ 1,4,University,0.416667,0.198232,0.5,0.212121,,,,,
26
+ 1,4,of,0.504902,0.198232,0.522876,0.209596,,,,,
27
+ 1,4,Notre,0.52451,0.198232,0.570261,0.209596,,,,,
28
+ 1,4,Dame.,0.575163,0.198232,0.625817,0.209596,,,,,
29
+ 1,4,I,0.632353,0.198232,0.637255,0.209596,,,,,
30
+ 1,4,am,0.643791,0.200758,0.666667,0.209596,,,,,
31
+ 1,4,applying,0.671569,0.198232,0.740196,0.212121,,,,,
32
+ 1,4,to,0.745098,0.199495,0.759804,0.209596,,,,,
33
+ 1,4,the,0.764706,0.198232,0.789216,0.209596,,,,,
34
+ 1,4,CMB,0.794118,0.198232,0.836601,0.209596,,,,,
35
+ 1,5,program,0.145425,0.218434,0.212418,0.229798,,,,,
36
+ 1,5,and,0.21732,0.215909,0.245098,0.227273,,,,,
37
+ 1,5,am,0.25,0.218434,0.27451,0.227273,,,,,
38
+ 1,5,very,0.279412,0.218434,0.313725,0.229798,,,,,
39
+ 1,5,interested,0.320261,0.214646,0.395425,0.22601,,,,,
40
+ 1,5,in,0.400327,0.214646,0.416667,0.22601,,,,,
41
+ 1,5,your,0.419935,0.218434,0.457516,0.229798,,,,,
42
+ 1,5,work.,0.460784,0.214646,0.506536,0.227273,,,,,
43
+ 1,5,After,0.511438,0.214646,0.553922,0.227273,,,,,
44
+ 1,5,glancing,0.55719,0.215909,0.625817,0.229798,,,,,
45
+ 1,5,at,0.630719,0.217172,0.645425,0.227273,,,,,
46
+ 1,5,a,0.650327,0.218434,0.658497,0.227273,,,,,
47
+ 1,5,few,0.663399,0.214646,0.69281,0.22601,,,,,
48
+ 1,5,of,0.697712,0.214646,0.715686,0.227273,,,,,
49
+ 1,5,your,0.718954,0.218434,0.754902,0.229798,,,,,
50
+ 1,5,recent,0.759804,0.217172,0.80719,0.22601,,,,,
51
+ 1,6,papers,0.145425,0.236111,0.197712,0.247475,,,,,
52
+ 1,6,and,0.202614,0.232323,0.230392,0.243687,,,,,
53
+ 1,6,your,0.235294,0.236111,0.271242,0.247475,,,,,
54
+ 1,6,research,0.276144,0.232323,0.341503,0.243687,,,,,
55
+ 1,6,summary,0.346405,0.236111,0.419935,0.247475,,,,,
56
+ 1,6,I,0.424837,0.232323,0.431373,0.243687,,,,,
57
+ 1,6,find,0.436275,0.232323,0.46732,0.243687,,,,,
58
+ 1,6,your,0.472222,0.236111,0.50817,0.247475,,,,,
59
+ 1,6,work,0.513072,0.232323,0.553922,0.243687,,,,,
60
+ 1,6,with,0.558824,0.232323,0.593137,0.243687,,,,,
61
+ 1,6,apoptosis,0.598039,0.233586,0.671569,0.247475,,,,,
62
+ 1,6,very,0.678105,0.236111,0.712418,0.247475,,,,,
63
+ 1,6,interesting.,0.71732,0.232323,0.803922,0.247475,,,,,
64
+ 1,6,Will,0.810458,0.232323,0.844771,0.243687,,,,,
65
+ 1,7,you,0.145425,0.253788,0.174837,0.263889,,,,,
66
+ 1,7,be,0.179739,0.25,0.199346,0.261364,,,,,
67
+ 1,7,taking,0.204248,0.25,0.253268,0.265152,,,,,
68
+ 1,7,on,0.25817,0.253788,0.277778,0.261364,,,,,
69
+ 1,7,new,0.28268,0.253788,0.315359,0.261364,,,,,
70
+ 1,7,students,0.320261,0.25,0.383987,0.261364,,,,,
71
+ 1,7,next,0.388889,0.251263,0.423203,0.261364,,,,,
72
+ 1,7,year?,0.428105,0.25,0.470588,0.263889,,,,,
73
+ 1,7,If,0.480392,0.25,0.495098,0.261364,,,,,
74
+ 1,7,I,0.498366,0.25,0.504902,0.261364,,,,,
75
+ 1,7,am,0.509804,0.253788,0.534314,0.261364,,,,,
76
+ 1,7,invited,0.539216,0.25,0.593137,0.261364,,,,,
77
+ 1,7,to,0.598039,0.251263,0.612745,0.261364,,,,,
78
+ 1,7,"interview,",0.617647,0.25,0.696078,0.263889,,,,,
79
+ 1,7,is,0.702614,0.25,0.714052,0.261364,,,,,
80
+ 1,7,there,0.718954,0.25,0.759804,0.261364,,,,,
81
+ 1,7,any,0.763072,0.253788,0.792484,0.263889,,,,,
82
+ 1,7,way,0.797386,0.253788,0.830065,0.263889,,,,,
83
+ 1,8,you,0.145425,0.271465,0.176471,0.281566,,,,,
84
+ 1,8,will,0.179739,0.267677,0.210784,0.27904,,,,,
85
+ 1,8,be,0.215686,0.267677,0.235294,0.27904,,,,,
86
+ 1,8,able,0.238562,0.267677,0.272876,0.27904,,,,,
87
+ 1,8,to,0.276144,0.268939,0.292484,0.27904,,,,,
88
+ 1,8,meet,0.297386,0.268939,0.334967,0.27904,,,,,
89
+ 1,8,with,0.339869,0.267677,0.375817,0.27904,,,,,
90
+ 1,8,me?,0.380719,0.267677,0.411765,0.27904,,,,,
91
+ 1,9,I,0.147059,0.30303,0.151961,0.314394,,,,,
92
+ 1,9,have,0.156863,0.30303,0.194444,0.314394,,,,,
93
+ 1,9,worked,0.199346,0.30303,0.25817,0.314394,,,,,
94
+ 1,9,on,0.263072,0.306818,0.28268,0.314394,,,,,
95
+ 1,9,several,0.287582,0.30303,0.343137,0.314394,,,,,
96
+ 1,9,different,0.348039,0.30303,0.416667,0.314394,,,,,
97
+ 1,9,research,0.419935,0.30303,0.485294,0.314394,,,,,
98
+ 1,9,projects,0.490196,0.30303,0.552288,0.318182,,,,,
99
+ 1,9,as,0.558824,0.306818,0.573529,0.314394,,,,,
100
+ 1,9,an,0.580065,0.306818,0.598039,0.314394,,,,,
101
+ 1,9,undergraduate,0.602941,0.30303,0.714052,0.318182,,,,,
102
+ 1,9,in,0.718954,0.30303,0.735294,0.314394,,,,,
103
+ 1,9,Dr.,0.740196,0.30303,0.764706,0.314394,,,,,
104
+ 1,9,David,0.769608,0.30303,0.816993,0.314394,,,,,
105
+ 1,9,R.,0.823529,0.30303,0.839869,0.314394,,,,,
106
+ 1,10,Hyde's,0.147059,0.320707,0.199346,0.334596,,,,,
107
+ 1,10,lab,0.204248,0.320707,0.228758,0.332071,,,,,
108
+ 1,10,at,0.23366,0.32197,0.248366,0.332071,,,,,
109
+ 1,10,the,0.251634,0.320707,0.276144,0.332071,,,,,
110
+ 1,10,University,0.281046,0.320707,0.364379,0.334596,,,,,
111
+ 1,10,of,0.369281,0.320707,0.387255,0.332071,,,,,
112
+ 1,10,Notre,0.390523,0.320707,0.434641,0.332071,,,,,
113
+ 1,10,Dame.,0.439542,0.320707,0.490196,0.332071,,,,,
114
+ 1,10,The,0.496732,0.320707,0.527778,0.332071,,,,,
115
+ 1,10,Hyde,0.53268,0.320707,0.573529,0.334596,,,,,
116
+ 1,10,lab,0.580065,0.320707,0.602941,0.332071,,,,,
117
+ 1,10,is,0.607843,0.320707,0.620915,0.332071,,,,,
118
+ 1,10,interested,0.625817,0.320707,0.702614,0.332071,,,,,
119
+ 1,10,in,0.707516,0.320707,0.722222,0.332071,,,,,
120
+ 1,10,the,0.727124,0.320707,0.751634,0.332071,,,,,
121
+ 1,10,signals,0.756536,0.320707,0.810458,0.334596,,,,,
122
+ 1,10,that,0.815359,0.320707,0.844771,0.332071,,,,,
123
+ 1,11,initiate,0.147059,0.338384,0.20098,0.349747,,,,,
124
+ 1,11,Muller,0.205882,0.338384,0.259804,0.349747,,,,,
125
+ 1,11,glia,0.264706,0.338384,0.292484,0.352273,,,,,
126
+ 1,11,division,0.297386,0.338384,0.361111,0.349747,,,,,
127
+ 1,11,post-light,0.366013,0.338384,0.44281,0.352273,,,,,
128
+ 1,11,damage.,0.446078,0.338384,0.511438,0.352273,,,,,
129
+ 1,11,My,0.51634,0.338384,0.544118,0.352273,,,,,
130
+ 1,11,first,0.54902,0.338384,0.581699,0.349747,,,,,
131
+ 1,11,research,0.584967,0.338384,0.650327,0.349747,,,,,
132
+ 1,11,project,0.655229,0.338384,0.710784,0.353535,,,,,
133
+ 1,11,was,0.715686,0.340909,0.745098,0.349747,,,,,
134
+ 1,12,characterizing,0.147059,0.354798,0.256536,0.369949,,,,,
135
+ 1,12,the,0.261438,0.356061,0.285948,0.367424,,,,,
136
+ 1,12,role,0.29085,0.356061,0.321895,0.367424,,,,,
137
+ 1,12,of,0.326797,0.356061,0.344771,0.367424,,,,,
138
+ 1,12,leukemia,0.348039,0.356061,0.419935,0.367424,,,,,
139
+ 1,12,inhibitory,0.424837,0.354798,0.501634,0.369949,,,,,
140
+ 1,12,factor,0.506536,0.356061,0.553922,0.367424,,,,,
141
+ 1,12,(LIF),0.55719,0.354798,0.599673,0.369949,,,,,
142
+ 1,12,in,0.604575,0.356061,0.620915,0.367424,,,,,
143
+ 1,12,the,0.624183,0.356061,0.648693,0.366162,,,,,
144
+ 1,12,activation,0.653595,0.356061,0.732026,0.367424,,,,,
145
+ 1,12,of,0.735294,0.354798,0.754902,0.367424,,,,,
146
+ 1,12,cell,0.756536,0.356061,0.785948,0.367424,,,,,
147
+ 1,13,proliferation,0.145425,0.372475,0.243464,0.387626,,,,,
148
+ 1,13,in,0.25,0.373737,0.264706,0.383838,,,,,
149
+ 1,13,the,0.269608,0.373737,0.292484,0.383838,,,,,
150
+ 1,13,undamaged,0.297386,0.372475,0.388889,0.387626,,,,,
151
+ 1,13,zebrafish,0.393791,0.372475,0.465686,0.383838,,,,,
152
+ 1,13,retina.,0.470588,0.373737,0.519608,0.383838,,,,,
153
+ 1,13,I,0.52451,0.373737,0.531046,0.383838,,,,,
154
+ 1,13,am,0.535948,0.376263,0.560458,0.383838,,,,,
155
+ 1,13,also,0.565359,0.372475,0.596405,0.383838,,,,,
156
+ 1,13,working,0.601307,0.372475,0.666667,0.387626,,,,,
157
+ 1,13,on,0.671569,0.376263,0.691176,0.385101,,,,,
158
+ 1,13,several,0.696078,0.373737,0.751634,0.383838,,,,,
159
+ 1,14,experiments,0.145425,0.390152,0.24183,0.405303,,,,,
160
+ 1,14,that,0.246732,0.390152,0.276144,0.401515,,,,,
161
+ 1,14,are,0.281046,0.393939,0.305556,0.401515,,,,,
162
+ 1,14,related,0.308824,0.390152,0.362745,0.401515,,,,,
163
+ 1,14,to,0.367647,0.392677,0.383987,0.401515,,,,,
164
+ 1,14,a,0.388889,0.393939,0.397059,0.401515,,,,,
165
+ 1,14,genetic,0.401961,0.390152,0.45915,0.405303,,,,,
166
+ 1,14,screen,0.464052,0.393939,0.514706,0.401515,,,,,
167
+ 1,14,that,0.517974,0.390152,0.547386,0.401515,,,,,
168
+ 1,14,the,0.552288,0.390152,0.576797,0.401515,,,,,
169
+ 1,14,Hyde,0.581699,0.390152,0.624183,0.405303,,,,,
170
+ 1,14,lab,0.629085,0.390152,0.653595,0.401515,,,,,
171
+ 1,14,plans,0.658497,0.390152,0.699346,0.405303,,,,,
172
+ 1,14,on,0.704248,0.393939,0.723856,0.401515,,,,,
173
+ 1,14,performing,0.728758,0.390152,0.816993,0.405303,,,,,
174
+ 1,14,to,0.821895,0.391414,0.836601,0.401515,,,,,
175
+ 1,15,identify,0.147059,0.407828,0.207516,0.421717,,,,,
176
+ 1,15,mutants,0.212418,0.409091,0.272876,0.419192,,,,,
177
+ 1,15,in,0.279412,0.407828,0.294118,0.419192,,,,,
178
+ 1,15,the,0.29902,0.407828,0.323529,0.419192,,,,,
179
+ 1,15,regeneration,0.328431,0.407828,0.426471,0.42298,,,,,
180
+ 1,15,pathway--I,0.429739,0.407828,0.51634,0.42298,,,,,
181
+ 1,15,am,0.522876,0.411616,0.545752,0.419192,,,,,
182
+ 1,15,developing,0.550654,0.407828,0.638889,0.42298,,,,,
183
+ 1,15,a,0.643791,0.411616,0.651961,0.419192,,,,,
184
+ 1,15,neuroD4:EGFP,0.656863,0.407828,0.78268,0.419192,,,,,
185
+ 1,16,transgenic,0.145425,0.425505,0.227124,0.439394,,,,,
186
+ 1,16,line,0.232026,0.425505,0.261438,0.436869,,,,,
187
+ 1,16,for,0.26634,0.425505,0.289216,0.436869,,,,,
188
+ 1,16,use,0.294118,0.42803,0.320261,0.436869,,,,,
189
+ 1,16,in,0.325163,0.425505,0.339869,0.436869,,,,,
190
+ 1,16,this,0.344771,0.425505,0.372549,0.436869,,,,,
191
+ 1,16,screen,0.377451,0.42803,0.428105,0.436869,,,,,
192
+ 1,16,and,0.433007,0.425505,0.460784,0.436869,,,,,
193
+ 1,16,I,0.46732,0.425505,0.472222,0.436869,,,,,
194
+ 1,16,am,0.477124,0.42803,0.501634,0.436869,,,,,
195
+ 1,16,characterizing,0.506536,0.425505,0.617647,0.439394,,,,,
196
+ 1,16,the,0.622549,0.425505,0.647059,0.436869,,,,,
197
+ 1,16,extent,0.651961,0.426768,0.70098,0.436869,,,,,
198
+ 1,16,of,0.704248,0.425505,0.722222,0.436869,,,,,
199
+ 1,16,damage,0.72549,0.425505,0.787582,0.439394,,,,,
200
+ 1,16,and,0.79085,0.425505,0.820261,0.436869,,,,,
201
+ 1,17,regeneration,0.145425,0.443182,0.243464,0.457071,,,,,
202
+ 1,17,in,0.25,0.443182,0.264706,0.454545,,,,,
203
+ 1,17,sheer,0.267974,0.443182,0.312092,0.454545,,,,,
204
+ 1,17,zebrafish,0.316993,0.443182,0.388889,0.454545,,,,,
205
+ 1,17,retinas.,0.393791,0.443182,0.449346,0.454545,,,,,
206
+ 1,17,"Finally,",0.455882,0.443182,0.51634,0.457071,,,,,
207
+ 1,17,I,0.521242,0.443182,0.527778,0.454545,,,,,
208
+ 1,17,am,0.53268,0.445707,0.55719,0.454545,,,,,
209
+ 1,17,characterizing,0.560458,0.443182,0.671569,0.457071,,,,,
210
+ 1,17,the,0.676471,0.443182,0.70098,0.454545,,,,,
211
+ 1,17,chx10:EGFP,0.705882,0.443182,0.808824,0.454545,,,,,
212
+ 1,18,transgenic,0.145425,0.459596,0.227124,0.474747,,,,,
213
+ 1,18,line,0.232026,0.459596,0.261438,0.47096,,,,,
214
+ 1,18,during,0.26634,0.459596,0.316993,0.474747,,,,,
215
+ 1,18,retinal,0.321895,0.459596,0.372549,0.47096,,,,,
216
+ 1,18,development,0.377451,0.459596,0.478758,0.474747,,,,,
217
+ 1,18,and,0.48366,0.460859,0.511438,0.47096,,,,,
218
+ 1,18,regeneration.,0.51634,0.459596,0.619281,0.474747,,,,,
219
+ 1,19,Please,0.145425,0.496212,0.196078,0.507576,,,,,
220
+ 1,19,find,0.20098,0.496212,0.232026,0.507576,,,,,
221
+ 1,19,my,0.236928,0.5,0.263072,0.510101,,,,,
222
+ 1,19,CV,0.267974,0.496212,0.295752,0.507576,,,,,
223
+ 1,19,attached.,0.29902,0.496212,0.369281,0.507576,,,,,
224
+ 1,20,Thank,0.145425,0.531566,0.196078,0.542929,,,,,
225
+ 1,20,you,0.20098,0.535354,0.230392,0.546717,,,,,
226
+ 1,20,for,0.235294,0.531566,0.25817,0.542929,,,,,
227
+ 1,20,your,0.263072,0.535354,0.29902,0.546717,,,,,
228
+ 1,20,"time,",0.303922,0.531566,0.343137,0.545455,,,,,
229
+ 1,21,--Lauren,0.147059,0.568182,0.215686,0.579545,,,,,
230
+ 1,21,Lilley,0.218954,0.566919,0.26634,0.582071,,,,,
231
+ 1,22,Dr.,0.145425,0.637626,0.171569,0.64899,,,,,
232
+ 1,22,"Poss,",0.176471,0.637626,0.21732,0.651515,,,,,
233
+ 1,23,I,0.145425,0.671717,0.151961,0.683081,,,,,
234
+ 1,23,am,0.158497,0.675505,0.181373,0.684343,,,,,
235
+ 1,23,a,0.186275,0.675505,0.194444,0.684343,,,,,
236
+ 1,23,senior,0.199346,0.671717,0.248366,0.683081,,,,,
237
+ 1,23,biology,0.253268,0.671717,0.312092,0.686869,,,,,
238
+ 1,23,major,0.316993,0.671717,0.364379,0.686869,,,,,
239
+ 1,23,at,0.369281,0.674242,0.382353,0.683081,,,,,
240
+ 1,23,the,0.387255,0.671717,0.411765,0.684343,,,,,
241
+ 1,23,University,0.416667,0.671717,0.498366,0.686869,,,,,
242
+ 1,23,of,0.504902,0.671717,0.522876,0.683081,,,,,
243
+ 1,23,Notre,0.52451,0.671717,0.570261,0.684343,,,,,
244
+ 1,23,Dame.,0.575163,0.671717,0.625817,0.684343,,,,,
245
+ 1,23,I,0.630719,0.671717,0.637255,0.683081,,,,,
246
+ 1,23,am,0.643791,0.675505,0.666667,0.684343,,,,,
247
+ 1,23,applying,0.671569,0.67298,0.740196,0.686869,,,,,
248
+ 1,23,to,0.745098,0.67298,0.759804,0.683081,,,,,
249
+ 1,23,your,0.764706,0.675505,0.802288,0.686869,,,,,
250
+ 1,24,graduate,0.145425,0.689394,0.214052,0.704545,,,,,
251
+ 1,24,program,0.218954,0.693182,0.284314,0.703283,,,,,
252
+ 1,24,and,0.289216,0.689394,0.318627,0.700758,,,,,
253
+ 1,24,am,0.323529,0.693182,0.348039,0.700758,,,,,
254
+ 1,24,very,0.351307,0.693182,0.387255,0.703283,,,,,
255
+ 1,24,interested,0.392157,0.689394,0.46732,0.700758,,,,,
256
+ 1,24,in,0.473856,0.689394,0.488562,0.700758,,,,,
257
+ 1,24,your,0.493464,0.693182,0.529412,0.703283,,,,,
258
+ 1,24,work.,0.534314,0.689394,0.578431,0.700758,,,,,
259
+ 1,24,After,0.583333,0.689394,0.625817,0.700758,,,,,
260
+ 1,24,glancing,0.630719,0.689394,0.697712,0.703283,,,,,
261
+ 1,24,at,0.702614,0.690657,0.71732,0.700758,,,,,
262
+ 1,24,a,0.722222,0.693182,0.730392,0.700758,,,,,
263
+ 1,24,few,0.735294,0.689394,0.764706,0.700758,,,,,
264
+ 1,24,of,0.769608,0.689394,0.787582,0.700758,,,,,
265
+ 1,24,your,0.79085,0.693182,0.826797,0.703283,,,,,
266
+ 1,25,recent,0.145425,0.708333,0.194444,0.718434,,,,,
267
+ 1,25,papers,0.199346,0.710859,0.25,0.72096,,,,,
268
+ 1,25,and,0.254902,0.707071,0.28268,0.718434,,,,,
269
+ 1,25,your,0.287582,0.710859,0.325163,0.72096,,,,,
270
+ 1,25,research,0.328431,0.707071,0.393791,0.718434,,,,,
271
+ 1,25,summary,0.398693,0.709596,0.472222,0.72096,,,,,
272
+ 1,25,I,0.477124,0.707071,0.48366,0.718434,,,,,
273
+ 1,25,find,0.488562,0.707071,0.519608,0.718434,,,,,
274
+ 1,25,your,0.52451,0.710859,0.562092,0.72096,,,,,
275
+ 1,25,research,0.565359,0.707071,0.632353,0.718434,,,,,
276
+ 1,25,greatly,0.637255,0.707071,0.691176,0.72096,,,,,
277
+ 1,25,coincides,0.696078,0.707071,0.769608,0.718434,,,,,
278
+ 1,25,with,0.77451,0.707071,0.810458,0.718434,,,,,
279
+ 1,25,my,0.813725,0.710859,0.839869,0.72096,,,,,
280
+ 1,26,research,0.145425,0.724747,0.210784,0.736111,,,,,
281
+ 1,26,experiences,0.21732,0.724747,0.308824,0.738636,,,,,
282
+ 1,26,and,0.313725,0.723485,0.341503,0.736111,,,,,
283
+ 1,26,interests.,0.346405,0.723485,0.416667,0.736111,,,,,
284
+ 1,26,Will,0.426471,0.723485,0.462418,0.736111,,,,,
285
+ 1,26,you,0.465686,0.727273,0.496732,0.738636,,,,,
286
+ 1,26,be,0.5,0.723485,0.519608,0.736111,,,,,
287
+ 1,26,taking,0.52451,0.724747,0.573529,0.738636,,,,,
288
+ 1,26,on,0.578431,0.727273,0.598039,0.736111,,,,,
289
+ 1,26,new,0.602941,0.727273,0.635621,0.736111,,,,,
290
+ 1,26,students,0.640523,0.724747,0.704248,0.736111,,,,,
291
+ 1,26,next,0.70915,0.72601,0.745098,0.734848,,,,,
292
+ 1,26,year?,0.748366,0.724747,0.79085,0.738636,,,,,
293
+ 1,27,I,0.145425,0.760101,0.151961,0.771465,,,,,
294
+ 1,27,have,0.156863,0.760101,0.194444,0.771465,,,,,
295
+ 1,27,worked,0.199346,0.760101,0.25817,0.771465,,,,,
296
+ 1,27,on,0.263072,0.763889,0.28268,0.771465,,,,,
297
+ 1,27,several,0.287582,0.760101,0.343137,0.771465,,,,,
298
+ 1,27,different,0.348039,0.760101,0.416667,0.771465,,,,,
299
+ 1,27,research,0.419935,0.760101,0.485294,0.771465,,,,,
300
+ 1,27,projects,0.490196,0.760101,0.552288,0.775253,,,,,
301
+ 1,27,as,0.55719,0.763889,0.573529,0.771465,,,,,
302
+ 1,27,an,0.578431,0.763889,0.598039,0.771465,,,,,
303
+ 1,27,undergraduate,0.602941,0.760101,0.714052,0.775253,,,,,
304
+ 1,27,in,0.718954,0.760101,0.735294,0.771465,,,,,
305
+ 1,27,Dr.,0.740196,0.760101,0.764706,0.771465,,,,,
306
+ 1,27,David,0.769608,0.760101,0.818627,0.771465,,,,,
307
+ 1,27,R.,0.823529,0.760101,0.839869,0.771465,,,,,
308
+ 1,28,Hyde's,0.145425,0.777778,0.199346,0.791667,,,,,
309
+ 1,28,lab,0.204248,0.777778,0.228758,0.789141,,,,,
310
+ 1,28,at,0.23366,0.77904,0.248366,0.789141,,,,,
311
+ 1,28,the,0.251634,0.777778,0.276144,0.789141,,,,,
312
+ 1,28,University,0.281046,0.777778,0.364379,0.791667,,,,,
313
+ 1,28,of,0.369281,0.777778,0.387255,0.789141,,,,,
314
+ 1,28,Notre,0.390523,0.777778,0.434641,0.789141,,,,,
315
+ 1,28,Dame.,0.439542,0.777778,0.490196,0.789141,,,,,
316
+ 1,28,The,0.496732,0.777778,0.527778,0.789141,,,,,
317
+ 1,28,Hyde,0.53268,0.777778,0.573529,0.791667,,,,,
318
+ 1,28,lab,0.580065,0.777778,0.602941,0.789141,,,,,
319
+ 1,28,is,0.607843,0.777778,0.620915,0.789141,,,,,
320
+ 1,28,interested,0.625817,0.777778,0.702614,0.789141,,,,,
321
+ 1,28,in,0.707516,0.777778,0.722222,0.789141,,,,,
322
+ 1,28,the,0.727124,0.777778,0.751634,0.789141,,,,,
323
+ 1,28,signals,0.756536,0.777778,0.810458,0.791667,,,,,
324
+ 1,28,that,0.815359,0.777778,0.846405,0.789141,,,,,
325
+ 1,29,initiate,0.145425,0.795455,0.20098,0.806818,,,,,
326
+ 1,29,Muller,0.205882,0.795455,0.259804,0.806818,,,,,
327
+ 1,29,glia,0.264706,0.795455,0.292484,0.809343,,,,,
328
+ 1,29,division,0.297386,0.795455,0.361111,0.806818,,,,,
329
+ 1,29,post-light,0.366013,0.795455,0.44281,0.809343,,,,,
330
+ 1,29,damage.,0.446078,0.795455,0.511438,0.809343,,,,,
331
+ 1,29,My,0.51634,0.795455,0.544118,0.809343,,,,,
332
+ 1,29,first,0.54902,0.795455,0.581699,0.806818,,,,,
333
+ 1,29,research,0.584967,0.795455,0.651961,0.806818,,,,,
334
+ 1,29,project,0.655229,0.795455,0.710784,0.809343,,,,,
335
+ 1,29,was,0.715686,0.799242,0.745098,0.806818,,,,,
336
+ 1,30,characterizing,0.145425,0.811869,0.25817,0.82702,,,,,
337
+ 1,30,the,0.261438,0.811869,0.285948,0.823232,,,,,
338
+ 1,30,role,0.29085,0.813131,0.321895,0.823232,,,,,
339
+ 1,30,of,0.326797,0.811869,0.344771,0.824495,,,,,
340
+ 1,30,leukemia,0.348039,0.811869,0.419935,0.823232,,,,,
341
+ 1,30,inhibitory,0.424837,0.811869,0.501634,0.82702,,,,,
342
+ 1,30,factor,0.506536,0.811869,0.553922,0.823232,,,,,
343
+ 1,30,(LIF),0.55719,0.813131,0.599673,0.82702,,,,,
344
+ 1,30,in,0.604575,0.811869,0.620915,0.824495,,,,,
345
+ 1,30,the,0.624183,0.811869,0.648693,0.824495,,,,,
346
+ 1,30,activation,0.653595,0.813131,0.732026,0.824495,,,,,
347
+ 1,30,of,0.735294,0.811869,0.754902,0.824495,,,,,
348
+ 1,30,cell,0.756536,0.811869,0.785948,0.824495,,,,,
349
+ 1,31,proliferation,0.145425,0.829545,0.245098,0.844697,,,,,
350
+ 1,31,in,0.25,0.829545,0.264706,0.840909,,,,,
351
+ 1,31,the,0.267974,0.829545,0.292484,0.840909,,,,,
352
+ 1,31,undamaged,0.297386,0.830808,0.388889,0.844697,,,,,
353
+ 1,31,zebrafish,0.393791,0.829545,0.465686,0.842172,,,,,
354
+ 1,31,retina.,0.470588,0.830808,0.519608,0.842172,,,,,
355
+ 1,31,I,0.52451,0.830808,0.531046,0.840909,,,,,
356
+ 1,31,am,0.535948,0.833333,0.560458,0.842172,,,,,
357
+ 1,31,also,0.565359,0.829545,0.596405,0.840909,,,,,
358
+ 1,31,working,0.601307,0.830808,0.666667,0.844697,,,,,
359
+ 1,31,on,0.671569,0.833333,0.691176,0.840909,,,,,
360
+ 1,31,several,0.696078,0.829545,0.751634,0.840909,,,,,
361
+ 1,32,experiments,0.145425,0.847222,0.24183,0.862374,,,,,
362
+ 1,32,that,0.246732,0.847222,0.276144,0.858586,,,,,
363
+ 1,32,are,0.281046,0.85101,0.305556,0.858586,,,,,
364
+ 1,32,related,0.308824,0.847222,0.362745,0.858586,,,,,
365
+ 1,32,to,0.367647,0.848485,0.383987,0.858586,,,,,
366
+ 1,32,a,0.388889,0.85101,0.397059,0.858586,,,,,
367
+ 1,32,genetic,0.401961,0.847222,0.45915,0.861111,,,,,
368
+ 1,32,screen,0.464052,0.85101,0.514706,0.858586,,,,,
369
+ 1,32,that,0.517974,0.847222,0.54902,0.858586,,,,,
370
+ 1,32,the,0.552288,0.847222,0.576797,0.858586,,,,,
371
+ 1,32,Hyde,0.581699,0.847222,0.624183,0.861111,,,,,
372
+ 1,32,lab,0.629085,0.847222,0.653595,0.858586,,,,,
373
+ 1,32,plans,0.656863,0.847222,0.699346,0.861111,,,,,
374
+ 1,32,on,0.704248,0.85101,0.723856,0.858586,,,,,
375
+ 1,32,performing,0.728758,0.847222,0.816993,0.862374,,,,,
376
+ 1,32,to,0.821895,0.848485,0.836601,0.858586,,,,,
377
+ 1,33,identify,0.145425,0.864899,0.207516,0.878788,,,,,
378
+ 1,33,mutants,0.212418,0.866162,0.272876,0.876263,,,,,
379
+ 1,33,in,0.279412,0.864899,0.294118,0.876263,,,,,
380
+ 1,33,the,0.29902,0.864899,0.323529,0.876263,,,,,
381
+ 1,33,regeneration,0.328431,0.864899,0.426471,0.878788,,,,,
382
+ 1,33,pathway--I,0.431373,0.864899,0.51634,0.878788,,,,,
383
+ 1,33,am,0.522876,0.868687,0.545752,0.876263,,,,,
384
+ 1,33,developing,0.550654,0.864899,0.638889,0.878788,,,,,
385
+ 1,33,a,0.643791,0.868687,0.651961,0.876263,,,,,
386
+ 1,33,neuroD4:EGFP,0.655229,0.864899,0.78268,0.876263,,,,,
387
+ 1,34,transgenic,0.145425,0.882576,0.227124,0.896465,,,,,
388
+ 1,34,line,0.232026,0.882576,0.261438,0.893939,,,,,
389
+ 1,34,for,0.26634,0.881313,0.289216,0.893939,,,,,
390
+ 1,34,use,0.294118,0.885101,0.320261,0.893939,,,,,
391
+ 1,34,in,0.325163,0.882576,0.339869,0.893939,,,,,
392
+ 1,34,this,0.344771,0.882576,0.372549,0.893939,,,,,
393
+ 1,34,screen,0.379085,0.885101,0.428105,0.893939,,,,,
394
+ 1,34,and,0.433007,0.882576,0.460784,0.893939,,,,,
395
+ 1,34,I,0.46732,0.882576,0.472222,0.893939,,,,,
396
+ 1,34,am,0.478758,0.885101,0.501634,0.893939,,,,,
397
+ 1,34,characterizing,0.506536,0.882576,0.617647,0.896465,,,,,
398
+ 1,34,the,0.622549,0.882576,0.647059,0.893939,,,,,
399
+ 1,34,extent,0.651961,0.883838,0.699346,0.892677,,,,,
400
+ 1,34,of,0.704248,0.882576,0.722222,0.893939,,,,,
401
+ 1,34,damage,0.72549,0.882576,0.785948,0.896465,,,,,
402
+ 1,34,and,0.79085,0.882576,0.820261,0.893939,,,,,
403
+ 2,1,regeneration,0.145425,0.093434,0.243464,0.107323,,,,,
404
+ 2,1,in,0.248366,0.093434,0.264706,0.104798,,,,,
405
+ 2,1,sheer,0.267974,0.093434,0.312092,0.104798,,,,,
406
+ 2,1,zebrafish,0.316993,0.093434,0.387255,0.104798,,,,,
407
+ 2,1,retinas.,0.392157,0.093434,0.449346,0.104798,,,,,
408
+ 2,1,"Finally,",0.455882,0.093434,0.514706,0.107323,,,,,
409
+ 2,1,I,0.521242,0.093434,0.527778,0.104798,,,,,
410
+ 2,1,am,0.53268,0.097222,0.555556,0.104798,,,,,
411
+ 2,1,characterizing,0.560458,0.093434,0.671569,0.107323,,,,,
412
+ 2,1,the,0.676471,0.093434,0.70098,0.104798,,,,,
413
+ 2,1,chx10:EGFP,0.705882,0.093434,0.808824,0.104798,,,,,
414
+ 2,2,transgenic,0.145425,0.111111,0.227124,0.125,,,,,
415
+ 2,2,line,0.232026,0.111111,0.261438,0.122475,,,,,
416
+ 2,2,during,0.26634,0.111111,0.316993,0.125,,,,,
417
+ 2,2,retinal,0.321895,0.111111,0.372549,0.122475,,,,,
418
+ 2,2,development,0.377451,0.111111,0.478758,0.125,,,,,
419
+ 2,2,and,0.48366,0.111111,0.511438,0.122475,,,,,
420
+ 2,2,regeneration.,0.51634,0.111111,0.617647,0.125,,,,,
421
+ 2,3,Please,0.145425,0.146465,0.196078,0.157828,,,,,
422
+ 2,3,find,0.20098,0.146465,0.232026,0.157828,,,,,
423
+ 2,3,my,0.236928,0.150253,0.263072,0.160354,,,,,
424
+ 2,3,CV,0.267974,0.146465,0.295752,0.157828,,,,,
425
+ 2,3,attached.,0.29902,0.146465,0.369281,0.157828,,,,,
426
+ 2,4,Thank,0.145425,0.183081,0.196078,0.193182,,,,,
427
+ 2,4,you,0.20098,0.185606,0.230392,0.19697,,,,,
428
+ 2,4,for,0.235294,0.181818,0.25817,0.193182,,,,,
429
+ 2,4,your,0.263072,0.185606,0.29902,0.19697,,,,,
430
+ 2,4,"time,",0.303922,0.181818,0.343137,0.195707,,,,,
431
+ 2,5,--Lauren,0.147059,0.218434,0.215686,0.229798,,,,,
432
+ 2,5,Lilley,0.218954,0.218434,0.26634,0.232323,,,,,
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.147059,0.162879,0.171569,0.174242,oJIosRHGyCRn,Dr
3
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.147059,0.162879,0.261438,0.176768,5C5tA6mfeL7T,Dr Kornbluth
4
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.162879,0.261438,0.176768,UoYN48bc2ry5,Kornbluth
5
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.30303,0.764706,0.314394,cAsjVETPEisV,Dr
6
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.30303,0.839869,0.314394,yQ5HKn4tfT7L,Dr David R.
7
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.30303,0.839869,0.314394,LR8phiOYnLWi,David R.
8
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.218954,0.566919,0.26634,0.582071,X8iObIauqZ9k,Lauren Lilley
9
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.145425,0.637626,0.171569,0.64899,SvWjK2F7R3un,Dr
10
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.145425,0.637626,0.21732,0.651515,zKJFVAOszwdM,Dr Poss
11
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.637626,0.21732,0.651515,Iqda7ixkzcmg,Poss
12
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.760101,0.764706,0.771465,TWQD93bGI3B3,Dr
13
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.760101,0.839869,0.771465,vQuQQwqWjSES,Dr David R.
14
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.760101,0.839869,0.771465,f8xf6ORJUSnG,David R.
15
+ placeholder_image_1.png,2,NAME,"(0, 0, 0)",0.218954,0.218434,0.26634,0.232323,N0nje9UiCzZK,Lauren Lilley
example_data/graduate-job-example-cover-letter.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71cc851d41f80dd8b045af32657b76bf85dd8f72d39ae08fa43dc7a78256fe35
3
+ size 77045
example_data/partnership_toolkit_redact_custom_deny_list.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Friendship City
2
+ United States
example_data/partnership_toolkit_redact_some_pages.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2
2
+ 5