seanpedrickcase commited on
Commit
45a02da
·
0 Parent(s):

Sync: Updated CDK deployment with options for using ECS Express mode, direct run mode, and agent route

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .coveragerc +56 -0
  2. .dockerignore +52 -0
  3. .gitattributes +9 -0
  4. .github/scripts/setup_test_data.py +320 -0
  5. .github/workflow_README.md +183 -0
  6. .github/workflows/archive_workflows/multi-os-test.yml +115 -0
  7. .github/workflows/ci.yml +269 -0
  8. .github/workflows/simple-test.yml +74 -0
  9. .github/workflows/sync-pi-agent-space.yml +64 -0
  10. .github/workflows/sync_to_hf.yml +54 -0
  11. .github/workflows/sync_to_hf_zero_gpu.yml +59 -0
  12. .gitignore +62 -0
  13. AGENTS.md +113 -0
  14. Dockerfile +232 -0
  15. Dockerfile.pi +40 -0
  16. MANIFEST.in +4 -0
  17. README.md +344 -0
  18. README_PYPI.md +328 -0
  19. agent-redact/README.md +25 -0
  20. agent-redact/pi-agent/.dockerignore +10 -0
  21. agent-redact/pi-agent/.gitattributes +2 -0
  22. agent-redact/pi-agent/Dockerfile +70 -0
  23. agent-redact/pi-agent/README.md +45 -0
  24. agent-redact/pi-agent/sync-manifest.txt +10 -0
  25. agent-redact/pi-agent/sync_to_space.sh +42 -0
  26. agent-redact/pi/agent/README.md +183 -0
  27. agent-redact/pi/agent/models.json +31 -0
  28. agent-redact/pi/agent/settings.json +32 -0
  29. agent-redact/pi/bootstrap_pi_config.py +151 -0
  30. agent-redact/pi/gradio_app.py +1769 -0
  31. agent-redact/pi/output_files.py +316 -0
  32. agent-redact/pi/pi_agent_config.py +715 -0
  33. agent-redact/pi/pi_examples.py +180 -0
  34. agent-redact/pi/pi_rpc_client.py +649 -0
  35. agent-redact/pi/pi_session_usage.py +185 -0
  36. agent-redact/pi/pi_workspace_skills.py +182 -0
  37. agent-redact/pi/redaction_prompt.py +556 -0
  38. agent-redact/pi/remote_redaction.py +104 -0
  39. agent-redact/pi/session_logs.py +119 -0
  40. agent-redact/pi/session_workspace.py +192 -0
  41. agent-redact/pi/start.sh +26 -0
  42. agent-redact/requirements_pi_agent.txt +35 -0
  43. agent_routes.py +1167 -0
  44. app.py +0 -0
  45. cdk/__init__.py +0 -0
  46. cdk/app.py +119 -0
  47. cdk/cdk.json.example +7 -0
  48. cdk/cdk_appregistry.py +69 -0
  49. cdk/cdk_config.py +590 -0
  50. cdk/cdk_functions.py +2448 -0
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.dockerignore ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ *.qmd
5
+ _quarto.yml
6
+ quarto_site/*
7
+ src/*
8
+ redaction_deps/*
9
+ .venv/*
10
+ examples/*
11
+ processing/*
12
+ tools/__pycache__/*
13
+ old_code/*
14
+ tesseract/*
15
+ poppler/*
16
+ build/*
17
+ dist/*
18
+ docs/*
19
+ .pi/*
20
+ build_deps/*
21
+ user_guide/*
22
+ _extensions/*
23
+ workspace/*
24
+ doc_redaction.egg-info/*
25
+ .venv_pypi_test/*
26
+ cdk/config/*
27
+ tld/*
28
+ cdk/config/*
29
+ cdk/cdk.out/*
30
+ cdk/archive/*
31
+ cdk.json
32
+ cdk.context.json
33
+ .quarto/*
34
+ logs/
35
+ output/
36
+ input/
37
+ feedback/
38
+ config/
39
+ usage/
40
+ test/config/*
41
+ test/feedback/*
42
+ test/input/*
43
+ test/logs/*
44
+ test/output/*
45
+ test/tmp/*
46
+ test/usage/*
47
+ .ruff_cache/*
48
+ model_cache/*
49
+ sanitized_file/*
50
+ src/doc_redaction.egg-info/*
51
+ docker_compose/*
52
+ skills/example_prompts/*
.gitattributes ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.sh text eol=lf
3
+ *.jpg filter=lfs diff=lfs merge=lfs -text
4
+ *.xls filter=lfs diff=lfs merge=lfs -text
5
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
6
+ *.docx filter=lfs diff=lfs merge=lfs -text
7
+ *.doc filter=lfs diff=lfs merge=lfs -text
8
+ *.png filter=lfs diff=lfs merge=lfs -text
9
+ *.ico filter=lfs diff=lfs merge=lfs -text
.github/scripts/setup_test_data.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Setup script for GitHub Actions test data.
4
+ Creates dummy test files when example data is not available.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ import pandas as pd
11
+
12
+
13
def create_directories():
    """Ensure the example-data folder and its outputs subfolder exist."""
    required = (
        "doc_redaction/example_data",
        "doc_redaction/example_data/example_outputs",
    )
    for target in required:
        # exist_ok=True keeps repeated runs from failing on existing folders.
        os.makedirs(target, exist_ok=True)
        print(f"Created directory: {target}")
20
+
21
+
22
def create_dummy_pdf():
    """Create the dummy PDF fixtures under ``doc_redaction/example_data``.

    Three documents are drawn with ReportLab. If ReportLab cannot be
    imported, it is installed with pip for the interpreter that is running
    this script. If it is still unavailable afterwards (the install failed
    or the import still fails), short plain-text placeholder files are
    written at the same paths so that the expected files at least exist.
    """
    # One entry per fixture: (path, placeholder text, pages). Each page is a
    # list of (y, text) pairs; every string is drawn at x=100.
    fixtures = [
        (
            "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
            "This is a dummy PDF file for testing",
            [
                [
                    (750, "This is a test document for redaction testing."),
                    (700, "Email: test@example.com"),
                    (650, "Phone: 123-456-7890"),
                    (600, "Name: John Doe"),
                    (550, "Address: 123 Test Street, Test City, TC 12345"),
                ],
                [
                    (750, "Second page content"),
                    (700, "More test data: jane.doe@example.com"),
                    (650, "Another phone: 987-654-3210"),
                ],
            ],
        ),
        (
            "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
            "This is a dummy Partnership Agreement PDF file for testing",
            [
                [
                    (750, "Partnership Agreement Toolkit"),
                    (700, "This is a test partnership agreement document."),
                    (650, "Contact: partnership@example.com"),
                    (600, "Phone: (555) 123-4567"),
                    (550, "Address: 123 Partnership Street, City, State 12345"),
                ],
                [
                    (750, "Page 2 - Partnership Details"),
                    (700, "More partnership information here."),
                    (650, "Contact: info@partnership.org"),
                ],
                [
                    (750, "Page 3 - Terms and Conditions"),
                    (700, "Terms and conditions content."),
                    (650, "Legal contact: legal@partnership.org"),
                ],
            ],
        ),
        (
            "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
            "This is a dummy cover letter PDF file for testing",
            [
                [
                    (750, "Cover Letter Example"),
                    (700, "Dear Hiring Manager,"),
                    (650, "I am writing to apply for the position."),
                    (600, "Contact: applicant@example.com"),
                    (550, "Phone: (555) 987-6543"),
                    (500, "Address: 456 Job Street, Employment City, EC 54321"),
                    (450, "Sincerely,"),
                    (400, "John Applicant"),
                ],
            ],
        ),
    ]

    def _import_reportlab():
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        return letter, canvas

    try:
        letter, canvas = _import_reportlab()
    except ImportError:
        import importlib
        import subprocess

        try:
            # Install into the interpreter running this script; a bare
            # ``pip`` found on PATH may belong to a different environment.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "reportlab"]
            )
            # Make the freshly installed package visible to this process.
            importlib.invalidate_caches()
            letter, canvas = _import_reportlab()
        except (ImportError, OSError, subprocess.CalledProcessError):
            print("ReportLab not available, skipping PDF creation")
            # Create simple text files instead
            for path, placeholder, _pages in fixtures:
                with open(path, "w", encoding="utf-8") as f:
                    f.write(placeholder)
            print("Created dummy text files instead of PDFs")
            return

    print(f"Directory exists: {os.path.exists('doc_redaction/example_data')}")
    for path, _placeholder, pages in fixtures:
        print(f"Creating PDF: {path}")
        c = canvas.Canvas(path, pagesize=letter)
        for index, page in enumerate(pages):
            # Start a new page before every page except the first; the last
            # page is finished by save().
            if index:
                c.showPage()
            for y, text in page:
                c.drawString(100, y, text)
        c.save()
        print(f"Created dummy PDF: {path}")
125
+
126
+
127
def create_dummy_csv():
    """Write the two small CSV fixtures (case notes and Lambeth text)."""
    # Case-notes table
    case_notes_path = "doc_redaction/example_data/combined_case_notes.csv"
    case_notes = pd.DataFrame(
        {
            "Case Note": [
                "Client visited for consultation regarding housing issues",
                "Follow-up appointment scheduled for next week",
                "Documentation submitted for review",
            ],
            "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
            "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
        }
    )
    case_notes.to_csv(case_notes_path, index=False)
    print(f"Created dummy CSV: {case_notes_path}")

    # Lambeth page-text table
    lambeth_path = (
        "doc_redaction/example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
    )
    lambeth = pd.DataFrame(
        {
            "text": [
                "Lambeth 2030 vision document content",
                "Our Future Our Lambeth strategic plan",
                "Community engagement and development",
            ],
            "page": [1, 2, 3],
        }
    )
    lambeth.to_csv(lambeth_path, index=False)
    print(f"Created dummy CSV: {lambeth_path}")
160
+
161
+
162
def create_dummy_word_doc():
    """Write a small .docx fixture; skip with a message if python-docx is missing."""
    body_paragraphs = (
        "This is a test document for redaction testing.",
        "Contact Information:",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street, Test City, TC 12345",
    )
    try:
        from docx import Document

        document = Document()
        document.add_heading("Test Document for Redaction", 0)
        for paragraph in body_paragraphs:
            document.add_paragraph(paragraph)

        target = (
            "doc_redaction/example_data/Bold minimalist professional cover letter.docx"
        )
        document.save(target)
        print("Created dummy Word document")
    except ImportError:
        print("python-docx not available, skipping Word document creation")
183
+
184
+
185
def create_allow_deny_lists():
    """Write the allow-list, deny-list and whole-page redaction CSV fixtures."""
    # Allow lists: the same words are written to both files.
    allow_words = {"word": ["test", "example", "document"]}
    for allow_path in (
        "doc_redaction/example_data/test_allow_list_graduate.csv",
        "doc_redaction/example_data/test_allow_list_partnership.csv",
    ):
        pd.DataFrame(allow_words).to_csv(allow_path, index=False)
    print("Created allow lists")

    # Deny lists: likewise one word set, two files.
    deny_words = {"word": ["sensitive", "confidential", "private"]}
    for deny_path in (
        "doc_redaction/example_data/partnership_toolkit_redact_custom_deny_list.csv",
        "doc_redaction/example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        pd.DataFrame(deny_words).to_csv(deny_path, index=False)
    print("Created deny lists")

    # Pages to redact in full.
    whole_pages = pd.DataFrame({"page": [1, 2]})
    whole_pages.to_csv(
        "doc_redaction/example_data/partnership_toolkit_redact_some_pages.csv",
        index=False,
    )
    print("Created whole page redaction list")
216
+
217
+
218
def create_ocr_output():
    """Write a three-row dummy OCR output CSV into the example_outputs folder."""
    column_names = ["page", "text", "left", "top", "width", "height", "line"]
    # One tuple per OCR line, in the column order above.
    rows = [
        (1, "This is page 1 content with some text", 0.1, 0.95, 0.05, 0.01, 1),
        (2, "This is page 2 content with different text", 0.3, 0.92, 0.02, 0.02, 2),
        (3, "This is page 3 content with more text", 0.5, 0.88, 0.02, 0.02, 3),
    ]
    ocr_frame = pd.DataFrame(rows, columns=column_names)
    ocr_frame.to_csv(
        "doc_redaction/example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
        index=False,
    )
    print("Created dummy OCR output CSV")
239
+
240
+
241
def create_dummy_image():
    """Render a white 800x600 JPEG with sample text; skip if PIL is missing."""
    try:
        from PIL import Image, ImageDraw, ImageFont

        picture = Image.new("RGB", (800, 600), color="white")
        pen = ImageDraw.Draw(picture)

        # Try the known system font locations in order, then fall back to
        # PIL's built-in default font.
        font = None
        for label, font_path in (
            ("DejaVuSans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
            ("Arial", "/System/Library/Fonts/Arial.ttf"),
        ):
            try:
                font = ImageFont.truetype(font_path, 20)
            except Exception as e:
                print(f"Error loading {label} font: {e}")
            else:
                break
        if font is None:
            font = ImageFont.load_default()

        # Each line of text is drawn at x=50, 50 pixels below the previous one.
        text_lines = (
            (50, "Test Document for Redaction"),
            (100, "Email: test@example.com"),
            (150, "Phone: 123-456-7890"),
            (200, "Name: John Doe"),
            (250, "Address: 123 Test Street"),
        )
        for y, text in text_lines:
            pen.text((50, y), text, fill="black", font=font)

        picture.save("doc_redaction/example_data/example_complaint_letter.jpg")
        print("Created dummy image")

    except ImportError:
        print("PIL not available, skipping image creation")
274
+
275
+
276
def main():
    """Create all test fixtures, list what was written and verify key files.

    Returns:
        int: 0 when every critical PDF exists after setup, 1 otherwise. The
        value is used as the process exit status so that a CI step running
        this script fails when a critical fixture is missing.
    """
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    create_directories()
    create_dummy_pdf()
    create_dummy_csv()
    create_dummy_word_doc()
    create_allow_deny_lists()
    create_ocr_output()
    create_dummy_image()

    print("\nTest data setup complete!")
    print("Created files:")
    for root, _dirs, files in os.walk("doc_redaction/example_data"):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"  {file_path}")
            # Verify the file exists and has content. os.path.exists() is
            # False for a dangling symlink even though os.walk() lists it.
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                print(f"    Size: {file_size} bytes")
            else:
                print("    WARNING: File does not exist!")

    # Verify critical files exist
    critical_files = [
        "doc_redaction/example_data/Partnership-Agreement-Toolkit_0_0.pdf",
        "doc_redaction/example_data/graduate-job-example-cover-letter.pdf",
        "doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ]

    print("\nVerifying critical test files:")
    missing = []
    for file_path in critical_files:
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f"✅ {file_path} exists ({file_size} bytes)")
        else:
            print(f"❌ {file_path} MISSING!")
            missing.append(file_path)

    return 1 if missing else 0


if __name__ == "__main__":
    sys.exit(main())
.github/workflow_README.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitHub Actions CI/CD Setup
2
+
3
+ This directory contains GitHub Actions workflows for automated testing of the CLI redaction application.
4
+
5
+ ## Workflows Overview
6
+
7
+ ### 1. **Simple Test Run** (`.github/workflows/simple-test.yml`)
8
+ - **Purpose**: Basic test execution
9
+ - **Triggers**: Push to `dev`, pull requests targeting `dev`, manual dispatch
10
+ - **OS**: Ubuntu Latest
11
+ - **Python**: 3.12
12
+ - **Features**:
13
+ - Installs system dependencies
14
+ - Sets up test data
15
+ - Runs CLI tests
16
+ - Runs pytest
17
+
18
+ ### 2. **Comprehensive CI/CD** (`.github/workflows/ci.yml`)
19
+ - **Purpose**: Full CI/CD pipeline
20
+ - **Features**:
21
+ - Linting (Ruff, Black)
22
+ - Unit tests (Python 3.11, 3.12, 3.13)
23
+ - Integration tests
24
+ - Security scanning (Safety, Bandit)
25
+ - Coverage reporting
26
+ - Package building (on main branch)
27
+
28
+ ### 3. **Multi-OS Testing** (`.github/workflows/archive_workflows/multi-os-test.yml`, archived)
29
+ - **Purpose**: Cross-platform testing
30
+ - **OS**: Ubuntu, macOS (Windows not included currently but may be reintroduced)
31
+ - **Python**: 3.11, 3.12, 3.13
32
+ - **Features**: Tests compatibility across different operating systems
33
+
34
+ ### 4. **Basic Test Suite** (`.github/workflows/test.yml`)
35
+ - **Purpose**: Original test workflow
36
+ - **Features**:
37
+ - Multiple Python versions
38
+ - System dependency installation
39
+ - Test data creation
40
+ - Coverage reporting
41
+
42
+ ## Setup Scripts
43
+
44
+ ### Test Data Setup (`.github/scripts/setup_test_data.py`)
45
+ Creates dummy test files when example data is not available:
46
+ - PDF documents
47
+ - CSV files
48
+ - Word documents
49
+ - Images
50
+ - Allow/deny lists
51
+ - OCR output files
52
+
53
+ ## Usage
54
+
55
+ ### Running Tests Locally
56
+
57
+ ```bash
58
+ # Install dependencies
59
+ pip install -r requirements.txt
60
+ pip install pytest pytest-cov
61
+
62
+ # Setup test data
63
+ python .github/scripts/setup_test_data.py
64
+
65
+ # Run tests
66
+ cd test
67
+ python cli_epilog_suite.py
68
+ ```
69
+
70
+ ### GitHub Actions Triggers
71
+
72
+ 1. **Push to main/dev**: Runs all tests
73
+ 2. **Pull Request**: Runs tests and linting
74
+ 3. **Daily Schedule**: Runs tests at 2 AM UTC (currently disabled; the `schedule` trigger is commented out in `ci.yml`)
75
+ 4. **Manual Trigger**: Can be triggered manually from GitHub
76
+
77
+ ## Configuration
78
+
79
+ ### Environment Variables
80
+ - `PYTHON_VERSION`: Default Python version (3.11)
81
+ - `PYTHONPATH`: Set automatically for test discovery
82
+
83
+ ### Caching
84
+ - Pip dependencies are cached for faster builds
85
+ - Cache key based on the `requirements_lightweight.txt` hash
86
+
87
+ ### Artifacts
88
+ - Test results (JUnit XML)
89
+ - Coverage reports (HTML, XML)
90
+ - Security reports
91
+ - Build artifacts (on main branch)
92
+
93
+ ## Test Data
94
+
95
+ The workflows automatically create test data when example files are missing (the paths below are relative to `doc_redaction/`):
96
+
97
+ ### Required Files Created:
98
+ - `example_data/example_of_emails_sent_to_a_professor_before_applying.pdf`
99
+ - `example_data/combined_case_notes.csv`
100
+ - `example_data/Bold minimalist professional cover letter.docx`
101
+ - `example_data/example_complaint_letter.jpg`
102
+ - `example_data/test_allow_list_*.csv`
103
+ - `example_data/partnership_toolkit_redact_*.csv`
104
+ - `example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv`
105
+
106
+ ### Dependencies Installed:
107
+ - **System**: tesseract-ocr, poppler-utils, OpenGL libraries
108
+ - **Python**: All requirements.txt packages + pytest, reportlab, pillow
109
+
110
+ ## Workflow Status
111
+
112
+ ### Success Criteria:
113
+ - ✅ All tests pass
114
+ - ✅ No linting errors
115
+ - ✅ Security checks pass
116
+ - ✅ Coverage meets threshold (if configured)
117
+
118
+ ### Failure Handling:
119
+ - Tests are designed to skip gracefully if files are missing
120
+ - AWS tests are expected to fail without credentials
121
+ - System dependency failures are handled with fallbacks
122
+
123
+ ## Customization
124
+
125
+ ### Adding New Tests:
126
+ 1. Add test methods to `test/cli_epilog_suite.py` or pytest files under `test/test_*.py`
127
+ 2. Update test data in `setup_test_data.py` if needed
128
+ 3. Tests will automatically run in all workflows
129
+
130
+ ### Modifying Workflows:
131
+ 1. Edit the appropriate `.yml` file
132
+ 2. Test locally first
133
+ 3. Push to trigger the workflow
134
+
135
+ ### Environment-Specific Settings:
136
+ - **Ubuntu**: Full system dependencies
137
+ - **Windows**: Python packages only
138
+ - **macOS**: Homebrew dependencies
139
+
140
+ ## Troubleshooting
141
+
142
+ ### Common Issues:
143
+
144
+ 1. **Missing Dependencies**:
145
+ - Check system dependency installation
146
+ - Verify Python package versions
147
+
148
+ 2. **Test Failures**:
149
+ - Check test data creation
150
+ - Verify file paths
151
+ - Review test output logs
152
+
153
+ 3. **AWS Test Failures**:
154
+ - Expected without credentials
155
+ - Tests are designed to handle this gracefully
156
+
157
+ 4. **System Dependency Issues**:
158
+ - Different OS have different requirements
159
+ - Check the specific OS section in workflows
160
+
161
+ ### Debug Mode:
162
+ Add `--verbose` or `-v` flags to pytest commands for more detailed output.
163
+
164
+ ## Security
165
+
166
+ - Dependencies are scanned with Safety (currently disabled in `ci.yml` because the scan requires a login)
167
+ - Code is scanned with Bandit
168
+ - No secrets are exposed in logs
169
+ - Test data is temporary and cleaned up
170
+
171
+ ## Performance
172
+
173
+ - Tests run in parallel where possible
174
+ - Dependencies are cached
175
+ - Only necessary system packages are installed
176
+ - Test data is created efficiently
177
+
178
+ ## Monitoring
179
+
180
+ - Workflow status is visible in GitHub Actions tab
181
+ - Coverage reports are uploaded as workflow artifacts (the Codecov upload step is currently commented out)
182
+ - Test results are available as artifacts
183
+ - Security reports are generated and stored
.github/workflows/archive_workflows/multi-os-test.yml ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Multi-OS Test
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ permissions:
10
+ contents: read
11
+ actions: read
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ${{ matrix.os }}
16
+ env:
17
+ SHOW_VLM_MODEL_OPTIONS: "False"
18
+ strategy:
19
+ matrix:
20
+ os: [ubuntu-latest, macos-latest] # windows-latest, not included as tesseract cannot be installed silently
21
+ python-version: ["3.11", "3.12", "3.13"]
22
exclude:
  # Exclude some combinations to reduce CI time.
  # Each exclude entry is compared against single matrix values, so a list
  # such as ["3.12", "3.13"] never matches; list one entry per version.
  #- os: windows-latest
  #  python-version: "3.12"
  #- os: windows-latest
  #  python-version: "3.13"
  - os: macos-latest
    python-version: "3.12"
  - os: macos-latest
    python-version: "3.13"
28
+
29
+ steps:
30
+ - uses: actions/checkout@v6
31
+
32
+ - name: Set up Python ${{ matrix.python-version }}
33
+ uses: actions/setup-python@v6
34
+ with:
35
+ python-version: ${{ matrix.python-version }}
36
+
37
+ - name: Install system dependencies (Ubuntu)
38
+ if: matrix.os == 'ubuntu-latest'
39
+ run: |
40
+ sudo apt-get update
41
+ sudo apt-get install -y \
42
+ tesseract-ocr \
43
+ tesseract-ocr-eng \
44
+ poppler-utils \
45
+ libgl1-mesa-dri \
46
+ libglib2.0-0 \
47
+ libsm6 \
48
+ libxext6 \
49
+ libxrender-dev \
50
+ libgomp1
51
+
52
+ - name: Install system dependencies (macOS)
53
+ if: matrix.os == 'macos-latest'
54
+ run: |
55
+ brew install tesseract poppler
56
+
57
+ - name: Install system dependencies (Windows)
58
+ if: matrix.os == 'windows-latest'
59
+ run: |
60
+ # Create tools directory
61
+ if (!(Test-Path "C:\tools")) {
62
+ mkdir C:\tools
63
+ }
64
+
65
+ # Download and install Tesseract
66
+ $tesseractUrl = "https://github.com/tesseract-ocr/tesseract/releases/download/5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe"
67
+ $tesseractInstaller = "C:\tools\tesseract-installer.exe"
68
+ Invoke-WebRequest -Uri $tesseractUrl -OutFile $tesseractInstaller
69
+
70
+ # Install Tesseract silently
71
+ Start-Process -FilePath $tesseractInstaller -ArgumentList "/S", "/D=C:\tools\tesseract" -Wait
72
+
73
+ # Download and extract Poppler
74
+ $popplerUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.07.0-0/Release-25.07.0-0.zip"
75
+ $popplerZip = "C:\tools\poppler.zip"
76
+ Invoke-WebRequest -Uri $popplerUrl -OutFile $popplerZip
77
+
78
+ # Extract Poppler
79
+ Expand-Archive -Path $popplerZip -DestinationPath C:\tools\poppler -Force
80
+
81
+ # Add to PATH
82
+ echo "C:\tools\tesseract" >> $env:GITHUB_PATH
83
+ echo "C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_PATH
84
+
85
+ # Set environment variables for your application
86
+ echo "TESSERACT_FOLDER=C:\tools\tesseract" >> $env:GITHUB_ENV
87
+ echo "POPPLER_FOLDER=C:\tools\poppler\poppler-25.07.0\Library\bin" >> $env:GITHUB_ENV
88
+ echo "TESSERACT_DATA_FOLDER=C:\tools\tesseract\tessdata" >> $env:GITHUB_ENV
89
+
90
+ # Verify installation using full paths (since PATH won't be updated in current session)
91
+ & "C:\tools\tesseract\tesseract.exe" --version
92
+ & "C:\tools\poppler\poppler-25.07.0\Library\bin\pdftoppm.exe" -v
93
+
94
+ - name: Install Python dependencies
95
+ run: |
96
+ python -m pip install --upgrade pip
97
+ pip install -r requirements.txt
98
+ pip install pytest pytest-cov reportlab pillow
99
+
100
+ - name: Download spaCy model
101
+ run: |
102
+ python -m spacy download en_core_web_lg
103
+
104
+ - name: Setup test data
105
+ run: |
106
+ python .github/scripts/setup_test_data.py
107
+
108
+ - name: Run CLI tests
109
+ run: |
110
+ cd test
111
+ python cli_epilog_suite.py
112
+
113
+ - name: Run tests with pytest
114
+ run: |
115
+ pytest test/ -v --tb=short
.github/workflows/ci.yml ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ workflow_dispatch:
9
+ #schedule:
10
+ # Run tests daily at 2 AM UTC
11
+ # - cron: '0 2 * * *'
12
+
13
+ permissions:
14
+ contents: read
15
+ actions: read
16
+ pull-requests: write
17
+ issues: write
18
+
19
+ env:
20
+ PYTHON_VERSION: "3.11"
21
+
22
+ jobs:
23
+ lint:
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - uses: actions/checkout@v6
27
+
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v6
30
+ with:
31
+ python-version: ${{ env.PYTHON_VERSION }}
32
+
33
+ - name: Install dependencies
34
+ run: |
35
+ python -m pip install --upgrade pip
36
+ pip install ruff black
37
+
38
+ - name: Run Ruff linter
39
+ run: ruff check .
40
+
41
+ - name: Run Black formatter check
42
+ run: black --check .
43
+
44
+ test-unit:
45
+ runs-on: ubuntu-latest
46
+ env:
47
+ # Avoid optional VLM/torch import path in tools.run_vlm (not installed in lightweight CI deps)
48
+ SHOW_VLM_MODEL_OPTIONS: "False"
49
+ strategy:
50
+ matrix:
51
+ python-version: [3.11, 3.12, 3.13]
52
+
53
+ steps:
54
+ - uses: actions/checkout@v6
55
+
56
+ - name: Set up Python ${{ matrix.python-version }}
57
+ uses: actions/setup-python@v6
58
+ with:
59
+ python-version: ${{ matrix.python-version }}
60
+
61
+ - name: Cache pip dependencies
62
+ uses: actions/cache@v5
63
+ with:
64
+ path: ~/.cache/pip
65
+ key: ${{ runner.os }}-pip-${{ hashFiles('requirements_lightweight.txt') }}
66
+ restore-keys: |
67
+ ${{ runner.os }}-pip-
68
+
69
+ - name: Install system dependencies
70
+ run: |
71
+ sudo apt-get update
72
+ sudo apt-get install -y \
73
+ tesseract-ocr \
74
+ tesseract-ocr-eng \
75
+ poppler-utils \
76
+ libgl1-mesa-dri \
77
+ libglib2.0-0 \
78
+ libsm6 \
79
+ libxext6 \
80
+ libxrender-dev \
81
+ libgomp1
82
+
83
+ - name: Install Python dependencies
84
+ run: |
85
+ python -m pip install --upgrade pip
86
+ pip install -r requirements_lightweight.txt
87
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
88
+
89
+ - name: Download spaCy model
90
+ run: |
91
+ python -m spacy download en_core_web_lg
92
+
93
+ - name: Setup test data
94
+ run: |
95
+ python .github/scripts/setup_test_data.py
96
+ echo "Setup script completed. Checking results:"
97
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
98
+
99
+ - name: Verify test data files
100
+ run: |
101
+ echo "Checking if critical test files exist:"
102
+ ls -la doc_redaction/example_data/
103
+ echo "Checking for specific PDF files:"
104
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
105
+ echo "Checking file sizes:"
106
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
107
+
108
+ - name: Clean up problematic config files
109
+ run: |
110
+ rm -f config*.py || true
111
+
112
+ - name: Run CLI tests
113
+ run: |
114
+ cd test
115
+ python cli_epilog_suite.py
116
+
117
+ - name: Run tests with pytest (JUnit and coverage)
118
+ run: |
119
+ pytest test/ -v --tb=short \
120
+ --junitxml=test-results.xml \
121
+ --cov=. --cov-config=.coveragerc \
122
+ --cov-report=xml --cov-report=html --cov-report=term
123
+
124
+ #- name: Upload coverage to Codecov - not necessary
125
+ # uses: codecov/codecov-action@v3
126
+ # if: matrix.python-version == '3.11'
127
+ # with:
128
+ # file: ./coverage.xml
129
+ # flags: unittests
130
+ # name: codecov-umbrella
131
+ # fail_ci_if_error: false
132
+
133
+ - name: Upload test results
134
+ uses: actions/upload-artifact@v6
135
+ if: always()
136
+ with:
137
+ name: test-results-python-${{ matrix.python-version }}
138
+ path: |
139
+ test-results.xml
140
+ htmlcov/
141
+ coverage.xml
142
+
143
+ test-integration:
144
+ runs-on: ubuntu-latest
145
+ needs: [lint, test-unit]
146
+ env:
147
+ SHOW_VLM_MODEL_OPTIONS: "False"
148
+
149
+ steps:
150
+ - uses: actions/checkout@v6
151
+
152
+ - name: Set up Python
153
+ uses: actions/setup-python@v6
154
+ with:
155
+ python-version: ${{ env.PYTHON_VERSION }}
156
+
157
+ - name: Install dependencies
158
+ run: |
159
+ python -m pip install --upgrade pip
160
+ pip install -r requirements_lightweight.txt
161
+ pip install pytest pytest-cov reportlab pillow
162
+
163
+ - name: Install system dependencies
164
+ run: |
165
+ sudo apt-get update
166
+ sudo apt-get install -y \
167
+ tesseract-ocr \
168
+ tesseract-ocr-eng \
169
+ poppler-utils \
170
+ libgl1-mesa-dri \
171
+ libglib2.0-0 \
172
+ libsm6 \
173
+ libxext6 \
174
+ libxrender-dev \
175
+ libgomp1
176
+
177
+ - name: Download spaCy model
178
+ run: |
179
+ python -m spacy download en_core_web_lg
180
+
181
+ - name: Setup test data
182
+ run: |
183
+ python .github/scripts/setup_test_data.py
184
+ echo "Setup script completed. Checking results:"
185
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
186
+
187
+ - name: Verify test data files
188
+ run: |
189
+ echo "Checking if critical test files exist:"
190
+ ls -la doc_redaction/example_data/
191
+ echo "Checking for specific PDF files:"
192
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
193
+ echo "Checking file sizes:"
194
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
195
+
196
+ - name: Run integration tests
197
+ run: |
198
+ cd test
199
+ python demo_single_test.py
200
+
201
+ - name: Test CLI help
202
+ run: |
203
+ python cli_redact.py --help
204
+
205
+ - name: Test CLI version
206
+ run: |
207
+ python -c "import sys; print(f'Python {sys.version}')"
208
+
209
+ security:
210
+ runs-on: ubuntu-latest
211
+ steps:
212
+ - uses: actions/checkout@v6
213
+
214
+ - name: Set up Python
215
+ uses: actions/setup-python@v6
216
+ with:
217
+ python-version: ${{ env.PYTHON_VERSION }}
218
+
219
+ - name: Install dependencies
220
+ run: |
221
+ python -m pip install --upgrade pip
222
+ pip install safety bandit
223
+
224
+ #- name: Run safety scan - removed as now requires login
225
+ # run: |
226
+ # safety scan -r requirements.txt
227
+
228
+ - name: Run bandit security check
229
+ run: |
230
+ bandit -r . -f json -o bandit-report.json || true
231
+
232
+ - name: Upload security report
233
+ uses: actions/upload-artifact@v6
234
+ if: always()
235
+ with:
236
+ name: security-report
237
+ path: bandit-report.json
238
+
239
+ build:
240
+ runs-on: ubuntu-latest
241
+ needs: [lint, test-unit]
242
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
243
+
244
+ steps:
245
+ - uses: actions/checkout@v6
246
+
247
+ - name: Set up Python
248
+ uses: actions/setup-python@v6
249
+ with:
250
+ python-version: ${{ env.PYTHON_VERSION }}
251
+
252
+ - name: Install build dependencies
253
+ run: |
254
+ python -m pip install --upgrade pip
255
+ pip install build twine
256
+
257
+ - name: Build package
258
+ run: |
259
+ python -m build
260
+
261
+ - name: Check package
262
+ run: |
263
+ twine check dist/*
264
+
265
+ - name: Upload build artifacts
266
+ uses: actions/upload-artifact@v6
267
+ with:
268
+ name: dist
269
+ path: dist/
.github/workflows/simple-test.yml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Simple Test Run
2
+
3
+ on:
4
+ push:
5
+ branches: [ dev ]
6
+ pull_request:
7
+ branches: [ dev ]
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+ actions: read
13
+
14
+ jobs:
15
+ test:
16
+ runs-on: ubuntu-latest
17
+ env:
18
+ SHOW_VLM_MODEL_OPTIONS: "False"
19
+
20
+ steps:
21
+ - uses: actions/checkout@v6
22
+
23
+ - name: Set up Python 3.12
24
+ uses: actions/setup-python@v6
25
+ with:
26
+ python-version: "3.12"
27
+
28
+ - name: Install system dependencies
29
+ run: |
30
+ sudo apt-get update
31
+ sudo apt-get install -y \
32
+ tesseract-ocr \
33
+ tesseract-ocr-eng \
34
+ poppler-utils \
35
+ libgl1-mesa-dri \
36
+ libglib2.0-0 \
37
+ libsm6 \
38
+ libxext6 \
39
+ libxrender-dev \
40
+ libgomp1
41
+
42
+ - name: Install Python dependencies
43
+ run: |
44
+ python -m pip install --upgrade pip
45
+ pip install -r requirements_lightweight.txt
46
+ pip install pytest pytest-cov reportlab pillow
47
+
48
+ - name: Download spaCy model
49
+ run: |
50
+ python -m spacy download en_core_web_lg
51
+
52
+ - name: Setup test data
53
+ run: |
54
+ python .github/scripts/setup_test_data.py
55
+ echo "Setup script completed. Checking results:"
56
+ ls -la doc_redaction/example_data/ || echo "doc_redaction/example_data directory not found"
57
+
58
+ - name: Verify test data files
59
+ run: |
60
+ echo "Checking if critical test files exist:"
61
+ ls -la doc_redaction/example_data/
62
+ echo "Checking for specific PDF files:"
63
+ ls -la doc_redaction/example_data/*.pdf || echo "No PDF files found"
64
+ echo "Checking file sizes:"
65
+ find doc_redaction/example_data -name "*.pdf" -exec ls -lh {} \;
66
+
67
+ - name: Run CLI tests
68
+ run: |
69
+ cd test
70
+ python cli_epilog_suite.py
71
+
72
+ - name: Run tests with pytest
73
+ run: |
74
+ pytest test/ -v --tb=short
.github/workflows/sync-pi-agent-space.yml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync Pi agent to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches: [dev]
6
+ paths:
7
+ - "agent-redact/**"
8
+ - "skills/**"
9
+ - "tools/**"
10
+ - "intros/**"
11
+ - "doc_redaction/example_data/**"
12
+ - "AGENTS.md"
13
+ - "config/**"
14
+ - ".github/workflows/sync-pi-agent-space.yml"
15
+ workflow_dispatch:
16
+
17
+ permissions:
18
+ contents: read
19
+
20
+ jobs:
21
+ sync-pi-agent-space:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v6
25
+ with:
26
+ fetch-depth: 1
27
+ lfs: true
28
+
29
+ - name: Install Git LFS
30
+ run: git lfs install
31
+
32
+ - name: Materialize example PDFs (Git LFS)
33
+ run: |
34
+ git lfs pull --include="doc_redaction/example_data/*.pdf"
35
+ for f in \
36
+ doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
37
+ doc_redaction/example_data/graduate-job-example-cover-letter.pdf; do
38
+ if head -1 "$f" | grep -q "^version https://git-lfs.github.com/spec/v1"; then
39
+ echo "Example PDF is still an LFS pointer (not materialized): $f" >&2
40
+ exit 1
41
+ fi
42
+ done
43
+
44
+ - name: Flatten Pi agent Space tree
45
+ run: |
46
+ chmod +x agent-redact/pi-agent/sync_to_space.sh
47
+ agent-redact/pi-agent/sync_to_space.sh /tmp/pi-agent-space
48
+
49
+ - name: Push to Hugging Face Space
50
+ run: |
51
+ COMMIT_MSG=$(git log -1 --pretty=%B)
52
+ echo "Syncing Pi agent Space: seanpedrickcase/agentic_document_redaction"
53
+ cd /tmp/pi-agent-space
54
+ git init -b main
55
+ git config user.name "$HF_USERNAME"
56
+ git config user.email "$HF_EMAIL"
57
+ git add .
58
+ git commit -m "Sync Pi agent Space: $COMMIT_MSG"
59
+ git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/agentic_document_redaction"
60
+ git push --force hf main
61
+ env:
62
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
63
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
64
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+ workflow_dispatch:
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ sync-to-hub:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+ with:
16
+ fetch-depth: 1 # Only get the latest state
17
+ lfs: true # Download actual LFS files so they can be pushed
18
+
19
+ - name: Install Git LFS
20
+ run: git lfs install
21
+
22
+ - name: Recreate repo history (single-commit force push)
23
+ run: |
24
+ # 1. Capture the message BEFORE we delete the .git folder
25
+ COMMIT_MSG=$(git log -1 --pretty=%B)
26
+ echo "Syncing commit message: $COMMIT_MSG"
27
+
28
+ # 2. DELETE the .git folder.
29
+ # This turns the repo into a standard folder of files.
30
+ rm -rf .git
31
+
32
+ # 3. Re-initialize a brand new git repo
33
+ git init -b main
34
+ git config --global user.name "$HF_USERNAME"
35
+ git config --global user.email "$HF_EMAIL"
36
+
37
+ # 4. Re-install LFS (needs to be done after git init)
38
+ git lfs install
39
+
40
+ # 5. Add the remote
41
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID
42
+
43
+ # 6. Add all files
44
+ # Since this is a fresh init, Git sees EVERY file as "New"
45
+ git add .
46
+
47
+ # 7. Commit and Force Push
48
+ git commit -m "Sync: $COMMIT_MSG"
49
+ git push --force hf main
50
+ env:
51
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
52
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
53
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
54
+ HF_REPO_ID: ${{ secrets.HF_REPO_ID }}
.github/workflows/sync_to_hf_zero_gpu.yml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub Zero GPU
2
+ on:
3
+ push:
4
+ branches: [dev]
5
+ workflow_dispatch:
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ sync-to-hub-zero-gpu:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+ with:
16
+ fetch-depth: 1 # Only get the latest state
17
+ lfs: true # Download actual LFS files so they can be pushed
18
+
19
+ - name: Install Git LFS
20
+ run: git lfs install
21
+
22
+ # HF Spaces read Space config from README.md front matter. The repo README
23
+ # targets GitHub (e.g. docker); patch only this CI checkout before HF push.
24
+ - name: Apply HF Zero GPU Space README front matter
25
+ run: python3 tools/apply_hf_zero_gpu_readme_frontmatter.py
26
+
27
+ - name: Recreate repo history (single-commit force push)
28
+ run: |
29
+ # 1. Capture the message BEFORE we delete the .git folder
30
+ COMMIT_MSG=$(git log -1 --pretty=%B)
31
+ echo "Syncing commit message: $COMMIT_MSG"
32
+
33
+ # 2. DELETE the .git folder.
34
+ # This turns the repo into a standard folder of files.
35
+ rm -rf .git
36
+
37
+ # 3. Re-initialize a brand new git repo
38
+ git init -b main
39
+ git config --global user.name "$HF_USERNAME"
40
+ git config --global user.email "$HF_EMAIL"
41
+
42
+ # 4. Re-install LFS (needs to be done after git init)
43
+ git lfs install
44
+
45
+ # 5. Add the remote
46
+ git remote add hf https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_REPO_ID_ZERO_GPU
47
+
48
+ # 6. Add all files
49
+ # Since this is a fresh init, Git sees EVERY file as "New"
50
+ git add .
51
+
52
+ # 7. Commit and Force Push
53
+ git commit -m "Sync: $COMMIT_MSG"
54
+ git push --force hf main
55
+ env:
56
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
57
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
58
+ HF_EMAIL: ${{ secrets.HF_EMAIL }}
59
+ HF_REPO_ID_ZERO_GPU: ${{ secrets.HF_REPO_ID_ZERO_GPU }}
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.url
2
+ *.ipynb
3
+ *.pyc
4
+ *.qmd
5
+ _quarto.yml
6
+ quarto_site/*
7
+ src/*
8
+ redaction_deps/*
9
+ .venv/*
10
+ examples/*
11
+ processing/*
12
+ input/*
13
+ output/*
14
+ tools/__pycache__/*
15
+ old_code/*
16
+ tesseract/*
17
+ poppler/*
18
+ build/*
19
+ dist/*
20
+ build_deps/*
21
+ logs/*
22
+ usage/*
23
+ feedback/*
24
+ config/*
25
+ !config/pi_agent.env.example
26
+ !config/docker_app_config.env.example
27
+ !config/app_config.env.example
28
+ workspace/*
29
+ user_guide/*
30
+ _extensions/*
31
+ doc_redaction.egg-info/*
32
+ .venv_pypi_test/*
33
+ cdk/config/*
34
+ cdk/cdk.out/*
35
+ cdk/archive/*
36
+ tld/*
37
+ tmp/*
38
+ docs/*
39
+ .pi/*
40
+ cdk.out/*
41
+ cdk.json
42
+ cdk.context.json
43
+ precheck.context.json
44
+ .quarto/*
45
+ /.quarto/
46
+ /_site/
47
+ test/config/*
48
+ test/feedback/*
49
+ test/input/*
50
+ test/logs/*
51
+ test/output/*
52
+ test/tmp/*
53
+ test/usage/*
54
+ .ruff_cache/*
55
+ model_cache/*
56
+ sanitized_file/*
57
+ src/doc_redaction.egg-info/*
58
+ docker_compose/*
59
+ **/*.quarto_ipynb
60
+ skills/example_prompts/*
61
+ .pi/sessions/
62
+ agent-redact/pi/agent/sessions/
AGENTS.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS.md
2
+
3
+ Context for AI coding agents working on **doc_redaction** (PII redaction for PDFs, images, Word, and tabular files). Human-oriented docs: [README.md](README.md). User guide: [doc_redaction user guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
4
+
5
+ ## Project overview
6
+
7
+ - **Stack**: Python 3.10+, Gradio UI ([app.py](app.py)), optional FastAPI when `RUN_FASTAPI` is enabled, AWS/LLM integrations via [tools/config.py](tools/config.py) and env files under `config/`.
8
+ - **License**: AGPL-3.0-only (see [pyproject.toml](pyproject.toml)). Respect license terms when adding dependencies.
9
+ - **Accuracy**: Outputs are not guaranteed complete; downstream use should assume **human review** of redacted material.
10
+
11
+ ## Cursor skills: redaction workflow (optional)
12
+
13
+ For agents operating the deployed app (Gradio Client, review CSV, `/review_apply`), these repo-local playbooks are a suggested ladder:
14
+
15
+ 0. **[`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md)** — copy-paste user task prompt (Pass 1 default, Pass 2 gated); **user redaction requirements go at the end of the prompt**.
16
+ 1. **[`skills/doc-redaction-app/SKILL.md`](skills/doc-redaction-app/SKILL.md)** — first-pass redaction (`/doc_redact` / `/redact_document`) and downloading artifacts.
17
+ 2. **[`skills/doc-redact-page-review/SKILL.md`](skills/doc-redact-page-review/SKILL.md)** — after outputs exist: **parallel per-page** child agents, merge into one full-document `*_review_file.csv`, **single** `/review_apply` from the parent.
18
+ 3. **[`skills/doc-redaction-modifications/SKILL.md`](skills/doc-redaction-modifications/SKILL.md)** — CSV mechanics, `preview_redaction_boxes`, `/review_apply` patterns, verification, VLM and PyMuPDF fallbacks (single-thread edits and the **technical** reference for page-review children).
19
+
20
+ ## Setup
21
+
22
+ 1. **System**: Install **Tesseract** and **Poppler** (required for OCR/PDF). See [README.md](README.md) (Windows/Linux sections).
23
+ 2. **Python**: Create a venv, then install the project (e.g. `pip install -e ".[dev]"` or follow README).
24
+ 3. **Configuration**: Copy or edit environment/config as described in README / `config/` (e.g. `app_config.env`). Do not commit secrets.
25
+
26
+ ## Run locally
27
+
28
+ - The Gradio/FastAPI entrypoint is [app.py](app.py). With FastAPI enabled, the typical pattern is `uvicorn app:app --host 0.0.0.0 --port 7860` (take the exact host/port from your config).
29
+ - OpenAPI docs: `/docs` when the FastAPI app is mounted.
30
+
31
+ ## Tests
32
+
33
+ - Run from repo root: `pytest` (optional: `pytest test/`).
34
+ - Fix failures related to your changes before opening a PR.
35
+
36
+ ## Line order (local OCR and simple text extraction)
37
+
38
+ Multi-column layouts use shared logic in [`tools/ocr_reading_order.py`](tools/ocr_reading_order.py). Controlled by **`LOCAL_OCR_READING_ORDER`** (`column` default, `legacy` for previous top-left behaviour).
39
+
40
+ ### Local OCR (Paddle/Tesseract)
41
+
42
+ Word boxes are merged into line-level CSV rows in [`combine_ocr_results`](tools/custom_image_analyser_engine.py).
43
+
44
+ - **`column`**: detect text columns, assign line numbers down each column left-to-right; full-width lines (headers) first. Stops cross-column merging that produced wide erroneous lines on multi-column PDFs. **Auto-fallback**: the page is treated as single-column unless a *consecutive cluster* of gutter rows (y-gap between adjacent rows ≤ `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, default `0.06` of page height) has ≥ `OCR_COLUMN_MIN_GUTTER_ROWS` (default `3`) rows **and** the cluster's topmost row is above the footer zone (`OCR_COLUMN_FOOTER_ZONE_FRACTION`, default `0.75`). This prevents isolated header bands (logo | title, 1 gutter row), signature-only blocks at the page bottom (cluster starts at y ≥ 0.75), or the combination of both, from forcing column mode on the single-column body text between them.
45
+ - **`PADDLE_PRESERVE_LINE_BOXES=True`** or **`CONVERT_LINE_TO_WORD_LEVEL=False`** with Paddle: keep Paddle line boxes (skip word split + regrouping); line numbers still use column reading order.
46
+
47
+ ### Simple text extraction (PyMuPDF)
48
+
49
+ [`redact_text_pdf`](tools/file_redaction.py) → [`process_page_to_structured_ocr_pymupdf`](tools/file_redaction.py) calls [`reorder_structured_text_lines`](tools/ocr_reading_order.py) after collecting lines, using **`page.mediabox`** width/height for full-span header detection.
50
+
51
+ `reorder_structured_text_lines` now mirrors `build_line_groups` (local OCR route):
52
+
53
+ 1. **Column-aware sort** (`sort_reading_order` / `assign_layout_boxes` / `detect_column_split_xpoints`) — or legacy top-left for single-column pages.
54
+ 2. **Y-band grouping** (`group_into_lines`) — merges any same-row PyMuPDF lines that were emitted as separate objects (e.g. mixed-font spans) and splits horizontally-disparate boxes via `_finalize_line`. *Column mode only.*
55
+ 3. **Secondary sub-column pass** (`_reorder_lines_column_major`) — ensures correct column-major order when sub-columns sit within a single macro-column. *Column mode only.*
56
+ 4. When a group contains more than one box, constituent boxes are **merged** into a single `OCRResult` (union bbox, joined text, concatenated chars/words).
57
+
58
+ In single-column / legacy mode only step 1 is applied; PyMuPDF lines are pre-formed so no merging is needed.
59
+
60
+ ### Tunables (both routes)
61
+
62
+ `OCR_FULL_SPAN_WIDTH_RATIO`, `OCR_COLUMN_GAP_MIN_FRACTION`, `OCR_COLUMN_GUTTER_MIN_FRACTION`, `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` (default `0.015` — fine-grained gutter scan in `assign_layout_boxes`; lower = detects narrower sub-column boundaries), `OCR_COLUMN_MIN_GUTTER_ROWS`, `OCR_COLUMN_MAX_BOX_HEIGHT_RATIO`, `OCR_COLUMN_MAX_CONSECUTIVE_GUTTER_GAP`, `OCR_COLUMN_FOOTER_ZONE_FRACTION`, `OCR_LINE_SPLIT_GAP_FRACTION` (default 0.025 — horizontal gap fraction that forces a line split; must be below the narrowest column gutter, ~0.030 for two-page spreads; also used as the gap threshold for the secondary sub-column sort in `build_line_groups`), `OCR_LINE_Y_THRESHOLD_FRACTION` (default 0.013 — row-alignment tolerance as a fraction of page height; reduced from 0.015 to correctly separate tightly-set 10 pt body text whose row spacing is ~0.014), `OCR_LINE_Y_THRESHOLD_MIN_PX`.
63
+
64
+ **Sub-column ordering** (`build_line_groups`): after the primary word-level column sort, a second pass (`_reorder_lines_column_major`) clusters the produced line groups by their leftmost x-position using `OCR_LINE_SPLIT_GAP_FRACTION` as the gap threshold. This ensures that adjacent narrow sub-columns whose word-level centre gap is below `column_gap_threshold` (e.g. two columns on a spread where each page is already one macro-column) are still output in left-to-right column-major order rather than interleaved by y-position.
65
+
66
+ **Fine-grained gutter-based column assignment** (`assign_layout_boxes`): before falling back to centre-gap clustering, `detect_column_split_xpoints` scans the page for structural gutters at the finer `OCR_COLUMN_SUBGUTTER_MIN_FRACTION` threshold (default 0.015). Each qualifying gutter cluster produces a `(split_x, y_min)` pair — the split point is only applied to boxes whose `top ≥ y_min`, preventing a narrow sub-column gutter (visible only in the lower two-column section) from mis-splitting a full-width introductory paragraph that sits above it. This correctly separates narrow adjacent columns (e.g. 1.9 % gutter on a two-page spread) without fragmenting full-width headings or paragraphs.
67
+
68
+ Changing line order affects PII page text, duplicate-page detection, and review CSV line indices on multi-column documents; re-review after upgrading.
69
+
70
+ ## Agentic / programmatic access (two surfaces)
71
+
72
+ ### 1. FastAPI Agent API (recommended for LLM agents: small JSON bodies)
73
+
74
+ When `RUN_FASTAPI` is true, routes are mounted under **`/agent`** ([agent_routes.py](agent_routes.py)).
75
+
76
+ - **Catalog**: `GET /agent/operations` — maps each Gradio `api_name` to an HTTP path and notes whether the route is implemented via CLI or returns HTTP 501 for Gradio-only flows.
77
+ - **Implemented POST routes** (CLI- or [tools/simplified_api.py](tools/simplified_api.py)-backed where noted):
78
+ `redact_document`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_pdfs`, `combine_review_csvs`, `export_review_redaction_overlay`, `export_review_page_ocr_visualisation`, `apply_review_redactions`, **`verify_redaction_coverage`** (Pass 1 QA: `must_redact` / `must_not_redact` regex lists, optional `redacted_pdf_path`, optional `auto_prune_suspicious` + `pruned_output_path`; returns `pass_strict`, `pass_with_cleanup`, `pages_flagged_for_vlm`, `pages_needing_csv_cleanup`), **`word_level_ocr_text_search`** (headless word OCR search with optional review-box overlap flags).
79
+
80
+ **Optional post-redaction Pass 1 QA (main app / CLI):** When `POST_REDACT_PASS1_QA=True` in [`tools/config.py`](tools/config.py) (or `config/app_config.env`), initial redaction emits `*_coverage_report.json` beside the review CSV and optionally `*_review_file_pruned.csv` (sibling, when `POST_REDACT_PASS1_AUTO_PRUNE=True`). Uses deny/allow lists and/or `POST_REDACT_PASS1_MUST_REDACT_PATH` / `POST_REDACT_PASS1_MUST_NOT_REDACT_PATH`. CLI overrides: `--post-redact-pass1-qa`, `--post-redact-pass1-auto-prune`. This is pre-review-apply sanity QA only — agent Pass 1 (policy edits + `/review_apply`) remains separate.
81
+ Note: on Gradio ([app.py](app.py)), the Review-tab visual exports use `api_name` **`page_redaction_review_image`** and **`page_ocr_review_image`**; the **`/agent`** routes above keep the explicit `export_review_*` names for the same operations.
82
+ - **Gradio-only stubs** (501 + JSON hint): `load_and_prepare_documents_or_data`.
83
+ - **Auth**: If `AGENT_API_KEY` is set in the environment, send header `X-Agent-API-Key` with that value.
84
+ - **Paths**: Inputs must resolve to files under the repo root, `INPUT_FOLDER`, or `OUTPUT_FOLDER` (see router validation).
85
+
86
+ Implementation uses **`cli_redact.main(direct_mode_args=...)`** where a CLI task exists (same behaviour as [cli_redact.py](cli_redact.py)); `apply_review_redactions` calls [tools/simplified_api.py](tools/simplified_api.py) instead.
87
+
88
+ ### 2. Gradio Client API (e.g. Hugging Face Spaces)
89
+
90
+ For remote Spaces or any Gradio deployment exposing the HTTP API:
91
+
92
+ - **Schema**: `GET https://<host>/gradio_api/info`
93
+ - **Call**: `POST https://<host>/gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order matches the named endpoint’s component list).
94
+ - **Poll**: `GET https://<host>/gradio_api/call/{api_name}/{event_id}`
95
+ - **Hugging Face**: `Authorization: Bearer $HF_TOKEN`
96
+
97
+ Named `api_name` values in this app include: `redact_document`, `load_and_prepare_documents_or_data`, `apply_review_redactions`, **`doc_redact`** (simple `gr.api`: one PDF/image + optional OCR/PII knobs; returns `(output_paths, message)`; `api_name='/doc_redact'`; parameters include `document_file`, `redact_entities`, `output_dir`, `ocr_method`, `pii_method`, `allow_list`, `deny_list`, `page_min`, `page_max`, **`handwrite_signature_checkbox`** — AWS Textract extraction options such as `Extract handwriting` / `Extract signatures`), **`review_apply`** (simple `gr.api`: PDF + `*_review_file.csv`; returns `(output_paths, message)`; `api_name='/review_apply'`), **`preview_boxes`** (simple `gr.api`: PDF + `*_review_file.csv`; renders proposed boxes onto the original PDF and returns `(zip_path, message)` — use to verify coordinates *before* calling `review_apply`, no redaction applied; `api_name='/preview_boxes'`), **`pdf_summarise`** (simple `gr.api`: PDF + optional summarisation/OCR knobs; returns `(output_paths, status_message, summary_text)`; `api_name='/pdf_summarise'`), **`tabular_redact`** (simple `gr.api`: one tabular file (CSV/XLSX/Parquet/DOCX) + optional knobs; returns `(output_paths, message)`; `api_name='/tabular_redact'`), **`page_redaction_review_image`** (short review overlay export; `api_name='/page_redaction_review_image'`), **`page_ocr_review_image`** (short OCR visualisation export; `api_name='/page_ocr_review_image'`), `word_level_ocr_text_search`, `redact_data`, `find_duplicate_pages`, `find_duplicate_tabular`, `summarise_document`, `combine_review_csvs`, `combine_review_pdfs`. The matching **`POST /agent`** names for those two visual exports are `export_review_redaction_overlay` and `export_review_page_ocr_visualisation` (§1). 
Many endpoints require **many positional arguments** (full Gradio state); prefer the short `gr.api` routes above or **`POST /agent/apply_review_redactions`** where applicable instead of building the full `data` array from `/gradio_api/info`.
98
+
99
+ ## CLI parity
100
+
101
+ For scripting and tests, `python cli_redact.py` with flags is authoritative; programmatic merges use `get_cli_default_args_dict()` in [cli_redact.py](cli_redact.py).
102
+
103
+ ## Security and data handling
104
+
105
+ - Do not commit API keys, tokens, or customer data.
106
+ - Treat paths as untrusted outside validated roots (see [tools/secure_path_utils.py](tools/secure_path_utils.py)).
107
+ - Optional `instruction` / LLM fields must not be passed into shell or unconstrained config keys.
108
+
109
+ ## Conventions for PRs
110
+
111
+ - Keep changes focused; avoid drive-by refactors.
112
+ - Match existing naming and patterns in [app.py](app.py) and [tools/](tools/).
113
+ - Update tests when behaviour changes; run `pytest` before merge.
Dockerfile ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage 1: Build dependencies and download models
2
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS builder
3
+
4
+ # Install system dependencies
5
+ RUN apt-get update \
6
+ && apt-get upgrade -y \
7
+ && apt-get install -y --no-install-recommends \
8
+ g++ \
9
+ make \
10
+ cmake \
11
+ unzip \
12
+ libcurl4-openssl-dev \
13
+ git \
14
+ && pip install --upgrade pip \
15
+ && apt-get clean \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ WORKDIR /src
19
+
20
+ COPY requirements_lightweight.txt .
21
+
22
+ RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
23
+
24
+ ARG INSTALL_GRADIO_MCP=False
25
+ ENV INSTALL_GRADIO_MCP=${INSTALL_GRADIO_MCP}
26
+
27
+ RUN if [ "$INSTALL_GRADIO_MCP" = "True" ]; then \
28
+ pip install --verbose --no-cache-dir --force-reinstall --target=/install "gradio[mcp]<=6.10.0"; \
29
+ fi
30
+
31
+ # Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. Note that GPU-enabled PaddleOCR is unlikely to work in the same environment as a GPU-enabled version of PyTorch, so it is recommended to install PaddleOCR as a CPU-only version if you want to use GPU-enabled PyTorch.
32
+
33
+ ARG INSTALL_PADDLEOCR=False
34
+ ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
35
+
36
+ ARG PADDLE_GPU_ENABLED=False
37
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
38
+
39
+ RUN if [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "False" ]; then \
40
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
41
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle<=3.2.1" && \
42
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
43
+ elif [ "$INSTALL_PADDLEOCR" = "True" ] && [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
44
+ pip install --verbose --no-cache-dir --target=/install "protobuf<=7.34.0" && \
45
+ pip install --verbose --no-cache-dir --target=/install "paddlepaddle-gpu<=3.2.1" --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ && \
46
+ pip install --verbose --no-cache-dir --target=/install "paddleocr<=3.3.0"; \
47
+ fi
48
+
49
+ ARG INSTALL_VLM=False
50
+ ENV INSTALL_VLM=${INSTALL_VLM}
51
+
52
+ ARG TORCH_GPU_ENABLED=False
53
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
54
+
55
+ # Optionally install VLM/LLM packages if the INSTALL_VLM environment variable is set to True.
56
+ RUN if [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "False" ]; then \
57
+ pip install --verbose --no-cache-dir --target=/install \
58
+ "torch==2.9.1+cpu" \
59
+ "torchvision==0.24.1+cpu" \
60
+ "transformers<=5.5.4" \
61
+ "accelerate<=1.13.0" \
62
+ "bitsandbytes<=0.49.2" \
63
+ "sentencepiece<=0.2.1" \
64
+ --extra-index-url https://download.pytorch.org/whl/cpu; \
65
+ elif [ "$INSTALL_VLM" = "True" ] && [ "$TORCH_GPU_ENABLED" = "True" ]; then \
66
+ pip install --verbose --no-cache-dir --target=/install "torch<=2.8.0" --index-url https://download.pytorch.org/whl/cu129 && \
67
+ pip install --verbose --no-cache-dir --target=/install "torchvision<=0.23.0" --index-url https://download.pytorch.org/whl/cu129 && \
68
+ pip install --verbose --no-cache-dir --target=/install \
69
+ "transformers<=5.5.4" \
70
+ "accelerate<=1.13.0" \
71
+ "bitsandbytes<=0.49.2" \
72
+ "sentencepiece<=0.2.1" && \
73
+ pip install --verbose --no-cache-dir --target=/install "optimum<=2.1.0" && \
74
+ pip install --verbose --no-cache-dir --target=/install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl && \
75
+ pip install --verbose --no-cache-dir --target=/install https://github.com/ModelCloud/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.8-cp312-cp312-linux_x86_64.whl; \
76
+ fi
77
+
78
+ # ===================================================================
79
+ # Stage 2: A common base for both Lambda and Gradio
80
+ # ===================================================================
81
+ FROM public.ecr.aws/docker/library/python:3.12.13-slim-trixie AS base
82
+
83
+ # MUST re-declare ARGs in every stage where they are used in RUN commands
84
+ ARG TORCH_GPU_ENABLED=False
85
+ ARG PADDLE_GPU_ENABLED=False
86
+
87
+ ENV TORCH_GPU_ENABLED=${TORCH_GPU_ENABLED}
88
+ ENV PADDLE_GPU_ENABLED=${PADDLE_GPU_ENABLED}
89
+
90
+ RUN apt-get update && apt-get install -y --no-install-recommends \
91
+ tesseract-ocr \
92
+ poppler-utils \
93
+ libgl1 \
94
+ libglib2.0-0 && \
95
+ if [ "$TORCH_GPU_ENABLED" = "True" ] || [ "$PADDLE_GPU_ENABLED" = "True" ]; then \
96
+ apt-get install -y --no-install-recommends libgomp1; \
97
+ fi && \
98
+ apt-get clean && rm -rf /var/lib/apt/lists/*
99
+
100
+ ENV APP_HOME=/home/user
101
+
102
+ # Set env variables for Gradio & other apps
103
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
104
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
105
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
106
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
107
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
108
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
109
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
110
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
111
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
112
+ TESSERACT_DATA_FOLDER=/usr/share/tessdata \
113
+ GRADIO_SERVER_NAME=0.0.0.0 \
114
+ GRADIO_SERVER_PORT=7860 \
115
+ PATH=$APP_HOME/.local/bin:$PATH \
116
+ PYTHONPATH=$APP_HOME/app \
117
+ PYTHONUNBUFFERED=1 \
118
+ PYTHONDONTWRITEBYTECODE=1 \
119
+ GRADIO_ALLOW_FLAGGING=never \
120
+ GRADIO_NUM_PORTS=1 \
121
+ GRADIO_ANALYTICS_ENABLED=False
122
+
123
+ # Copy Python packages from the builder stage
124
+ COPY --from=builder /install /usr/local/lib/python3.12/site-packages/
125
+ COPY --from=builder /install/bin /usr/local/bin/
126
+
127
+ # Reinstall protobuf into the final site-packages. Builder uses multiple `pip install --target=/install`
128
+ # passes; that can break the `google` namespace so `google.protobuf` is missing and Paddle fails at import.
129
+ RUN pip install --no-cache-dir "protobuf<=7.34.0"
130
+
131
+ # The spaCy English pipeline (en_core_web_lg) is not a normal PyPI dependency; bundle it in the image so runtime works offline.
132
+ # Placed before COPY app code so application changes do not invalidate this layer.
133
+ RUN python -m spacy download en_core_web_lg
134
+
135
+ # Copy your application code and entrypoint
136
+ COPY . ${APP_HOME}/app
137
+ COPY entrypoint.sh ${APP_HOME}/app/entrypoint.sh
138
+ # Fix line endings and set execute permissions
139
+ RUN sed -i 's/\r$//' ${APP_HOME}/app/entrypoint.sh \
140
+ && chmod +x ${APP_HOME}/app/entrypoint.sh
141
+
142
+ WORKDIR ${APP_HOME}/app
143
+
144
+ # ===================================================================
145
+ # FINAL Stage 3: The Lambda Image (runs as root for simplicity)
146
+ # ===================================================================
147
+ FROM base AS lambda
148
+ # Set runtime ENV for Lambda mode
149
+ ENV APP_MODE=lambda
150
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
151
+ CMD ["lambda_entrypoint.lambda_handler"]
152
+
153
+ # ===================================================================
154
+ # FINAL Stage 4: The Gradio Image (runs as a secure, non-root user)
155
+ # ===================================================================
156
+ FROM base AS gradio
157
+ # Set runtime ENV for Gradio mode
158
+ ENV APP_MODE=gradio
159
+
160
+ # Create non-root user
161
+ RUN useradd -m -u 1000 user
162
+
163
+ # Create the base application directory and set its ownership
164
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
165
+
166
+ # Create required sub-folders within the app directory and set their permissions
167
+ # This ensures these specific directories are owned by 'user'
168
+ RUN mkdir -p \
169
+ ${APP_HOME}/app/output \
170
+ ${APP_HOME}/app/input \
171
+ ${APP_HOME}/app/logs \
172
+ ${APP_HOME}/app/usage \
173
+ ${APP_HOME}/app/feedback \
174
+ ${APP_HOME}/app/config \
175
+ && chown user:user \
176
+ ${APP_HOME}/app/output \
177
+ ${APP_HOME}/app/input \
178
+ ${APP_HOME}/app/logs \
179
+ ${APP_HOME}/app/usage \
180
+ ${APP_HOME}/app/feedback \
181
+ ${APP_HOME}/app/config \
182
+ && chmod 755 \
183
+ ${APP_HOME}/app/output \
184
+ ${APP_HOME}/app/input \
185
+ ${APP_HOME}/app/logs \
186
+ ${APP_HOME}/app/usage \
187
+ ${APP_HOME}/app/feedback \
188
+ ${APP_HOME}/app/config
189
+
190
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
191
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
192
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
193
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
194
+ && chmod 700 ${XDG_CACHE_HOME} \
195
+ && mkdir -p ${APP_HOME}/.paddlex \
196
+ && chown user:user ${APP_HOME}/.paddlex \
197
+ && chmod 755 ${APP_HOME}/.paddlex \
198
+ && mkdir -p ${APP_HOME}/.local/share/spacy/data \
199
+ && chown user:user ${APP_HOME}/.local/share/spacy/data \
200
+ && chmod 755 ${APP_HOME}/.local/share/spacy/data \
201
+ && mkdir -p /usr/share/tessdata \
202
+ && chown user:user /usr/share/tessdata \
203
+ && chmod 755 /usr/share/tessdata
204
+
205
+ # Apply user ownership to all files in the home directory
206
+ RUN chown -R user:user /home/user
207
+
208
+ # Set permissions for Python executable
209
+ RUN chmod 755 /usr/local/bin/python
210
+
211
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
212
+ VOLUME ["/tmp/matplotlib_cache"]
213
+ VOLUME ["/tmp/gradio_tmp"]
214
+ VOLUME ["/tmp/tld"]
215
+ VOLUME ["/home/user/app/output"]
216
+ VOLUME ["/home/user/app/input"]
217
+ VOLUME ["/home/user/app/logs"]
218
+ VOLUME ["/home/user/app/usage"]
219
+ VOLUME ["/home/user/app/feedback"]
220
+ VOLUME ["/home/user/app/config"]
221
+ VOLUME ["/home/user/.paddlex"]
222
+ VOLUME ["/home/user/.local/share/spacy/data"]
223
+ VOLUME ["/usr/share/tessdata"]
224
+ VOLUME ["/tmp"]
225
+ VOLUME ["/var/tmp"]
226
+
227
+ USER user
228
+
229
+ EXPOSE $GRADIO_SERVER_PORT
230
+
231
+ ENTRYPOINT ["/home/user/app/entrypoint.sh"]
232
+ CMD ["python", "app.py"]
Dockerfile.pi ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+
3
+ FROM node:22-bookworm-slim
4
+
5
+ ENV NODE_ENV=production
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+ ENV NPM_CONFIG_LOGLEVEL=warn
8
+ ENV PYTHONUNBUFFERED=1
9
+ ENV PYTHONDONTWRITEBYTECODE=1
10
+ ENV PYTHONPATH=/workspace/doc_redaction
11
+
12
+ RUN apt-get update && apt-get install -y --no-install-recommends \
13
+ bash \
14
+ git \
15
+ curl \
16
+ ca-certificates \
17
+ procps \
18
+ python3 \
19
+ python3-pip \
20
+ python3-venv \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
24
+
25
+ COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
26
+ RUN pip3 install --no-cache-dir --break-system-packages \
27
+ -r /tmp/requirements_pi_agent.txt \
28
+ && rm /tmp/requirements_pi_agent.txt
29
+
30
+ RUN mkdir -p /home/node/.pi/agent/sessions /workspace/doc_redaction \
31
+ && chown -R node:node /home/node/.pi /workspace
32
+
33
+ WORKDIR /workspace/doc_redaction
34
+
35
+ USER node
36
+
37
+ RUN pi --version
38
+
39
+ ENTRYPOINT ["pi"]
40
+ CMD []
MANIFEST.in ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ recursive-include doc_redaction/assets *.png
2
+ recursive-include doc_redaction/example_data *
3
+ recursive-include intros *.txt
4
+
README.md ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document redaction
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: true
9
+ license: agpl-3.0
10
+ short_description: OCR / redact PDF documents and tabular data
11
+ ---
12
+ # Document redaction (doc_redaction)
13
+
14
+ <a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
15
+
16
+ Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
17
+
18
+ ---
19
+
20
+ ## 🚀 Quick Start - Installation and first run
21
+
22
+ Follow these instructions to get the document redaction application running on your local machine.
23
+
24
+ ### 1. Package installation
25
+
26
+ #### Option 1 - Recommended: Install from source repo
27
+
28
+ Clone the repository and install in editable mode:
29
+
30
+ ```bash
31
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
32
+ cd doc_redaction
33
+ pip install -e .
34
+ ```
35
+
36
+ ##### Install extras (Paddle or Transformers/Torch VLM)
37
+
38
+ To install with PaddleOCR:
39
+
40
+ ```bash
41
+ pip install -e ".[paddle]"
42
+ ```
43
+
44
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
45
+ ```bash
46
+ pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
47
+ ```
48
+
49
+ If you want to run VLMs / LLMs with the transformers package:
50
+
51
+ ```bash
52
+ pip install -e ".[vlm]"
53
+ ```
54
+
55
+
56
+ **Note:** It is difficult to get paddlepaddle gpu working in an environment alongside torch. You may well need to reinstall the cpu version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following paddle gpu install, you may need to install the latest c++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170)
57
+
58
+ ```bash
59
+ pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
60
+ pip install torchvision --index-url https://download.pytorch.org/whl/cu129
61
+ ```
62
+
63
+ #### Option 2 - Install from PyPI
64
+
65
+ Create a virtual environment (recommended) and install **doc_redaction**.
66
+
67
+ ```bash
68
+ python -m venv venv
69
+ # Windows:
70
+ .\venv\Scripts\activate
71
+ # macOS/Linux:
72
+ source venv/bin/activate
73
+ ```
74
+
75
+ The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
76
+
77
+ ```bash
78
+ pip install doc_redaction
79
+ ```
80
+
81
+ Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
82
+
83
+ ```bash
84
+ pip install "doc_redaction[paddle]"
85
+ ```
86
+
87
+ For running VLMs / LLMs with the transformers package:
88
+
89
+ ```bash
90
+ pip install "doc_redaction[vlm]"
91
+ ```
92
+
93
+ For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
94
+
95
+ **Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
96
+
97
+ ```bash
98
+ python -m app
99
+ ```
100
+
101
+ **Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
102
+
103
+ - It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
104
+ - It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
105
+ - The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
106
+
107
+ In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
108
+
109
+ #### Option 3 - Docker installation
110
+
111
+ The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
112
+
113
+ ##### With Llama.cpp / vLLM inference server
114
+
115
+ The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
116
+
117
+ For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
118
+
119
+ You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
120
+
121
+ ##### Without Llama.cpp / vLLM inference server
122
+
123
+ If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
124
+
125
+ ### 2. Install prerequisites: Tesseract and Poppler
126
+
127
+ This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding. To run the Document Redaction app successfully, these tools need to be installed and either 1. added to PATH, or 2. be in a folder that is directly referenced in the config/app_config.env file with the variables TESSERACT_FOLDER and POPPLER_FOLDER (defined [here](https://github.com/seanpedrick-case/doc_redaction/blob/main/tools/config.py) if you want to see the code). The instructions below will guide you through different ways to install these dependencies.
128
+
129
+ ---
130
+
131
+ #### Automated dependency setup (recommended)
132
+
133
+ If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
134
+
135
+ You need the installer script available first, which means either:
136
+
137
+ - **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
138
+ - **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
139
+
140
+ From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
141
+
142
+ ```bash
143
+ python -m doc_redaction.install_deps
144
+ ```
145
+
146
+ This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
147
+
148
+ To just check whether your machine can already see the tools:
149
+
150
+ ```bash
151
+ python -m doc_redaction.install_deps --verify-only
152
+ ```
153
+
154
+ #### **On Windows**
155
+
156
+ If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
157
+
158
+ 1. **Install Tesseract OCR:**
159
+ * Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
160
+ * Run the installer.
161
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
162
+
163
+
164
+ 2. **Install Poppler:**
165
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
166
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
167
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
168
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
169
+ * Click the "Environment Variables..." button.
170
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
171
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
172
+ * Click OK on all windows to save the changes.
173
+
174
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
175
+ ---
176
+
177
+ #### **On Linux (Debian/Ubuntu)**
178
+
179
+ Open your terminal and run the following command to install Tesseract and Poppler:
180
+
181
+ ```bash
182
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
183
+ ```
184
+
185
+ #### **On Linux (Fedora/CentOS/RHEL)**
186
+
187
+ Open your terminal and use the `dnf` or `yum` package manager:
188
+
189
+ ```bash
190
+ sudo dnf install -y tesseract poppler-utils
191
+ ```
192
+ ---
193
+
194
+ ### 3. Run the Application
195
+
196
+ With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
197
+
198
+ ```bash
199
+ python app.py
200
+ ```
201
+
202
+ After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
203
+
204
+ Open this URL in your web browser to use the document redaction tool.
205
+
206
+ #### Command line interface
207
+
208
+ For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321)
209
+
210
+ If you installed from **PyPI**, use the installed console script:
211
+
212
+ ```bash
213
+ cli_redact --help
214
+ ```
215
+
216
+ From a **repository checkout**, you can also run:
217
+
218
+ ```bash
219
+ python cli_redact.py --help
220
+ ```
221
+
222
+ #### Python package commands
223
+
224
+ For Python examples in using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
225
+
226
+ ---
227
+
228
+
229
+ ### 4. ⚙️ Configuration (Optional)
230
+
231
+ You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanation on the documentation website for [the github repo](https://seanpedrick-case.github.io/doc_redaction/)
232
+
233
+ To get started:
234
+ 1. Copy `config/app_config.env.example` to `config/app_config.env`.
235
+ 2. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
236
+
237
+ If you do not create this file, the application will run with default settings.
238
+
239
+ #### Configuration Breakdown
240
+
241
+ Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
242
+
243
+ ---
244
+
245
+ #### **Local & General Settings (No AWS Required)**
246
+
247
+ These settings are useful for all users, regardless of whether you are using AWS.
248
+
249
+ * `TESSERACT_FOLDER` / `POPPLER_FOLDER`
250
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
251
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
252
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
253
+
254
+ * `TESSERACT_DATA_FOLDER`
255
+ * If Tesseract runs but you see an error like `Error opening data file ./eng.traineddata` or `Tesseract couldn't load any languages`, this is usually because it can't find the `tessdata/` language files.
256
+ * Set this to the folder that contains `eng.traineddata` (typically a `tessdata` directory).
257
+ * **Examples (Windows):** `TESSERACT_DATA_FOLDER=C:/Program Files/Tesseract-OCR/tessdata`
258
+
259
+ * `SHOW_LANGUAGE_SELECTION=True`
260
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
261
+
262
+ * `DEFAULT_LOCAL_OCR_MODEL=tesseract`
263
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "Tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction.
264
+
265
+ * `SESSION_OUTPUT_FOLDER=False`
266
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
267
+
268
+ * `DISPLAY_FILE_NAMES_IN_LOGS=False`
269
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
270
+
271
+ ---
272
+
273
+ #### **AWS-Specific Settings**
274
+
275
+ These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
276
+
277
+ * `RUN_AWS_FUNCTIONS=True`
278
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
279
+
280
+ * **UI Options:**
281
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
282
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
283
+
284
+ * **Core AWS Configuration:**
285
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
286
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
287
+
288
+ * **AWS Logging:**
289
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
290
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
291
+
292
+ * **Advanced AWS Textract Features:**
293
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
294
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
295
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
296
+
297
+ * **Cost Tracking (for internal accounting):**
298
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
299
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
300
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
301
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
302
+
303
+ Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
304
+
305
+ ## For agents (API quickstart)
306
+
307
+ If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
308
+
309
+ - **Discover schema**: `GET /gradio_api/info`
310
+ - **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
311
+ - **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
312
+ - **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
313
+ - **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
314
+
315
+ ### Choose the correct route (prefer short `gr.api` endpoints)
316
+
317
+ Fetch `/gradio_api/info` and then prefer the simplest route that exists:
318
+
319
+ - **Apply edited review CSV to a PDF**: `/review_apply`
320
+ - **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
321
+ - **Summarise a PDF**: `/pdf_summarise`
322
+ - **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
323
+
324
+ If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
325
+
326
+ ### Common gotchas
327
+
328
+ - **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
329
+ - **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
330
+ - **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
331
+
332
+ ### Optional: MCP server
333
+
334
+ If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
335
+
336
+ **Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
337
+
338
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
339
+
340
+ For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
341
+
342
+ Additional options under 'Redaction settings' include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
343
+
344
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
README_PYPI.md ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Document redaction (doc_redaction)
2
+
3
+ <a href="https://pypi.org/project/doc-redaction/" target="_blank"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/doc-redaction"></a>
4
+
5
+ Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for a full walkthrough of all the features in the app.
6
+
7
+ ---
8
+
9
+ ## 🚀 Quick Start - Installation and first run
10
+
11
+ Follow these instructions to get the document redaction application running on your local machine.
12
+
13
+ ### 1. Package installation
14
+
15
+ #### Option 1 - Recommended: Install from source repo
16
+
17
+ Clone the repository and install in editable mode:
18
+
19
+ ```bash
20
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
21
+ cd doc_redaction
22
+ pip install -e .
23
+ ```
24
+
25
+ ##### Install extras (Paddle or Transformers/Torch VLM)
26
+
27
+ To install with PaddleOCR:
28
+
29
+ ```bash
30
+ pip install -e ".[paddle]"
31
+ ```
32
+
33
+ Note that the versions of both PaddleOCR and Torch installed by default are the CPU-only versions. If you want to install the equivalent GPU versions, you will need to run the following commands:
34
+ ```bash
35
+ pip install paddlepaddle-gpu==3.2.1 --index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
36
+ ```
37
+
38
+ If you want to run VLMs / LLMs with the transformers package:
39
+
40
+ ```bash
41
+ pip install -e ".[vlm]"
42
+ ```
43
+
44
+
45
+ **Note:** It is difficult to get paddlepaddle gpu working in an environment alongside torch. You may well need to reinstall the cpu version to ensure compatibility, and run paddlepaddle-gpu in a separate environment without torch installed. If you get errors related to .dll files following paddle gpu install, you may need to install the latest c++ redistributables. For Windows, you can find them [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170)
46
+
47
+ ```bash
48
+ pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129
49
+ pip install torchvision --index-url https://download.pytorch.org/whl/cu129
50
+ ```
51
+
52
+ #### Option 2 - Install from PyPI
53
+
54
+ Create a virtual environment (recommended) and install **doc_redaction**.
55
+
56
+ ```bash
57
+ python -m venv venv
58
+ # Windows:
59
+ .\venv\Scripts\activate
60
+ # macOS/Linux:
61
+ source venv/bin/activate
62
+ ```
63
+
64
+ The package is published on PyPI as **`doc-redaction`** (import name **`doc_redaction`**):
65
+
66
+ ```bash
67
+ pip install doc_redaction
68
+ ```
69
+
70
+ Optional extras (same as in `pyproject.toml`). For installing paddleOCR:
71
+
72
+ ```bash
73
+ pip install "doc_redaction[paddle]"
74
+ ```
75
+
76
+ For running VLMs / LLMs with the transformers package:
77
+
78
+ ```bash
79
+ pip install "doc_redaction[vlm]"
80
+ ```
81
+
82
+ For programmatic use (CLI-first API matching Gradio `api_name` routes), see **[Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html)**. The console script **`cli_redact`** is available after install.
83
+
84
+ **Web UI from a PyPI install:** You *can* start the Gradio UI after `pip install doc_redaction` by running (note that the prerequisites tesseract and poppler will need to be correctly installed following step 2 below):
85
+
86
+ ```bash
87
+ python -m app
88
+ ```
89
+
90
+ **Important: your working directory matters.** When you run `python -m app`, the app treats your *current folder* as the “app folder”:
91
+
92
+ - It will look for configuration at `config/app_config.env` *relative to the folder you run it from* (and `python -m doc_redaction.install_deps` will also write `config/app_config.env` there).
93
+ - It may create new folders in that location (for example `config/`, `output/`, `input/`, `logs/`, `usage/`, `feedback/`, and temporary/cache folders depending on your settings).
94
+ - The UI example files and bundled assets are packaged with the PyPI install (they live inside the installed `doc_redaction` package). If you run from a “random” directory after a PyPI install, the app can still locate its packaged examples; your working directory mainly affects where `config/`, `input/`, `output/`, logs, and temp folders are created.
95
+
96
+ In practice, the **smoothest UI experience** (examples, bundled assets, docs links, predictable relative paths) is still usually via a **repository checkout** or **Docker**, but PyPI install is sufficient to launch the UI as long as you run it from a suitable working folder and have the system dependencies available (or run `python -m doc_redaction.install_deps` first).
97
+
98
+ #### Option 3 - Docker installation
99
+
100
+ The doc_redaction Redaction app can be installed by using the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) or Docker compose files ([llama.cpp](https://github.com/ggml-org/llama.cpp), [vLLM](https://docs.vllm.ai/en/stable/)) provided in the repo.
101
+
102
+ ##### With Llama.cpp / vLLM inference server
103
+
104
+ The project now has Docker and Docker compose files available to pair running the Redaction app with local inference servers powered by [llama.cpp](https://github.com/ggml-org/llama.cpp), or [vLLM](https://docs.vllm.ai/en/stable/). Llama.cpp is more flexible than vLLM for low VRAM systems, as Llama.cpp will offload to cpu/system RAM automatically rather than failing as vLLM tends to do.
105
+
106
+ For Llama.cpp, you can use the [docker-compose_llama.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_llama.yml) file, and for vLLM, you can use the [docker-compose_vllm.yml](https://github.com/seanpedrick-case/doc_redaction/blob/main/docker-compose_vllm.yml) file. To run, Docker / Docker Desktop should be installed, and then you can run the commands suggested in the top of the files to run the servers.
107
+
108
+ You will need ~40 GB of disk space to run everything depending on the model chosen from the compose file. For the vLLM server, you will need 24 GB VRAM. For the Llama.cpp server, 24 GB VRAM is needed to run at full speed, but the n-gpu-layers and n-cpu-moe parameters in the Docker compose file can be adjusted to fit into your system. I would suggest that 8 GB VRAM is needed as a bare minimum for decent inference speed. See the [Unsloth guide](https://unsloth.ai/docs/models/qwen3.5) for more details on working with GGUF files for Qwen 3.5.
109
+
110
+ ##### Without Llama.cpp / vLLM inference server
111
+
112
+ If you want a working Docker installation without GPU support, you can install from the [Dockerfile](https://github.com/seanpedrick-case/doc_redaction/blob/main/Dockerfile) in the repo. A working example of this, with the CPU version of PaddleOCR, can be found on [Hugging Face](https://huggingface.co/spaces/seanpedrickcase/document_redaction). You can adjust the INSTALL_PADDLEOCR, PADDLE_GPU_ENABLED, INSTALL_VLM, and TORCH_GPU_ENABLED config variables to adjust for PaddleOCR and Transformers packages for local VLM support. Note that GPU-enabled PaddleOCR and GPU-enabled Transformers/Torch often don't work well together, which is one reason why a Llama.cpp/vLLM inference server Docker installation option is provided above.
113
+
114
+ ### 2. Install prerequisites: Tesseract and Poppler
115
+
116
+ This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
117
+
118
+ ---
119
+
120
+ #### Automated dependency setup (recommended)
121
+
122
+ If you **don’t have admin rights** (or you just want the simplest setup), you can have the project download and configure **Tesseract** and **Poppler** into a local `redaction_deps/` folder inside the doc_redaction folder.
123
+
124
+ You need the installer script available first, which means either:
125
+
126
+ - **Repository checkout**: `git clone ...` and run the command from the repo root (recommended for the web UI), or
127
+ - **PyPI install**: `pip install doc_redaction` and run from a writable folder where you want `redaction_deps/` and `config/app_config.env` to be created/updated.
128
+
129
+ From the repository root (or your chosen working folder) after creating/activating your venv and installing Python requirements:
130
+
131
+ ```bash
132
+ python -m doc_redaction.install_deps
133
+ ```
134
+
135
+ This writes `TESSERACT_FOLDER` / `POPPLER_FOLDER` into `config/app_config.env` so the app can find the binaries without you editing your system PATH.
136
+
137
+ To just check whether your machine can already see the tools:
138
+
139
+ ```bash
140
+ python -m doc_redaction.install_deps --verify-only
141
+ ```
142
+
143
+ #### **On Windows**
144
+
145
+ If you don’t use the automated setup above, you can install the dependencies manually by downloading installers and adding the programs to your system's PATH.
146
+
147
+ 1. **Install Tesseract OCR:**
148
+ * Download the installer from the [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
149
+ * Run the installer.
150
+ * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
151
+
152
+
153
+ 2. **Install Poppler:**
154
+ * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-25.07.0-win.zip`).
155
+ * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
156
+ * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
157
+ * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
158
+ * Click the "Environment Variables..." button.
159
+ * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
160
+ * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
161
+ * Click OK on all windows to save the changes.
162
+
163
+ To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
164
+ ***
165
+
166
+ #### **On Linux (Debian/Ubuntu)**
167
+
168
+ Open your terminal and run the following command to install Tesseract and Poppler:
169
+
170
+ ```bash
171
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
172
+ ```
173
+
174
+ #### **On Linux (Fedora/CentOS/RHEL)**
175
+
176
+ Open your terminal and use the `dnf` or `yum` package manager:
177
+
178
+ ```bash
179
+ sudo dnf install -y tesseract poppler-utils
180
+ ```
181
+ ---
182
+
183
+ ### 3. Run the Application
184
+
185
+ With all dependencies installed, you can now start the Gradio application GUI. For a guide on how to use this, please go [here](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html).
186
+
187
+ ```bash
188
+ python app.py
189
+ ```
190
+
191
+ After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
192
+
193
+ Open this URL in your web browser to use the document redaction tool.
194
+
195
+ #### Command line interface
196
+
197
+ For example CLI commands, please refer to [this guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html#command-line-interface-cli) or the examples in [cli_redact.py](https://github.com/seanpedrick-case/doc_redaction/blob/main/cli_redact.py#L321).
198
+
199
+ If you installed from **PyPI**, use the installed console script:
200
+
201
+ ```bash
202
+ cli_redact --help
203
+ ```
204
+
205
+ From a **repository checkout**, you can also run:
206
+
207
+ ```bash
208
+ python cli_redact.py --help
209
+ ```
210
+
211
+ #### Python package commands
212
+
213
+ For examples of using the Python package, please see [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
214
+
215
+ ---
216
+
217
+
218
+ ### 4. ⚙️ Configuration (Optional)
219
+
220
+ You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the settings you can modify in the `app_config.env` file can be found in `tools/config.py`, with explanations on the [documentation website for the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
221
+
222
+ To get started:
223
+ 1. Copy `config/app_config.env.example` to `config/app_config.env`.
224
+ 2. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
225
+
226
+ If you do not create this file, the application will run with default settings.
227
+
228
+ #### Configuration Breakdown
229
+
230
+ Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
231
+
232
+ ---
233
+
234
+ #### **Local & General Settings (No AWS Required)**
235
+
236
+ These settings are useful for all users, regardless of whether you are using AWS.
237
+
238
+ * `TESSERACT_FOLDER` / `POPPLER_FOLDER`
239
+ * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
240
+ * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
241
+ * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
242
+
243
+ * `SHOW_LANGUAGE_SELECTION=True`
244
+ * Set to `True` to display a language selection dropdown in the UI for OCR processing.
245
+
246
+ * `DEFAULT_LOCAL_OCR_MODEL=tesseract`
247
+ * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. `tesseract` is the default, and is recommended. `hybrid-paddle` is a combination of the two: a first pass is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence. `paddle` only returns whole-line text extraction, and so will only work for OCR, not redaction.
248
+
249
+ * `SESSION_OUTPUT_FOLDER=False`
250
+ * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
251
+
252
+ * `DISPLAY_FILE_NAMES_IN_LOGS=False`
253
+ * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
254
+
255
+ ---
256
+
257
+ #### **AWS-Specific Settings**
258
+
259
+ These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
260
+
261
+ * `RUN_AWS_FUNCTIONS=True`
262
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
263
+
264
+ * **UI Options:**
265
+ * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
266
+ * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
267
+
268
+ * **Core AWS Configuration:**
269
+ * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
270
+ * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
271
+
272
+ * **AWS Logging:**
273
+ * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
274
+ * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
275
+
276
+ * **Advanced AWS Textract Features:**
277
+ * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
278
+ * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
279
+ * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
280
+
281
+ * **Cost Tracking (for internal accounting):**
282
+ * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
283
+ * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
284
+ * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
285
+ * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
286
+
287
+ Now you have the app installed, please refer to the [User Guide](https://seanpedrick-case.github.io/doc_redaction/src/user_guide.html) for more information on how to use it for basic and advanced redaction.
288
+
289
+ ## For agents (API quickstart)
290
+
291
+ If you are an LLM/agent interacting with this app over HTTP (e.g. Hugging Face Spaces), **do not guess inputs** from the UI. Use the Gradio schema as the source of truth:
292
+
293
+ - **Discover schema**: `GET /gradio_api/info`
294
+ - **Upload files**: `POST /gradio_api/upload` (multipart field `files`) → returns server-internal paths like `/tmp/gradio_tmp/...`
295
+ - **Call**: `POST /gradio_api/call/{api_name}` with body `{"data":[...]}` (argument order must match `/gradio_api/info`)
296
+ - **Poll**: `GET /gradio_api/call/{api_name}/{event_id}` until complete
297
+ - **Download outputs**: `GET /gradio_api/file={path}` (note: some deployments return 403 without session cookies)
298
+
299
+ ### Choose the correct route (prefer short `gr.api` endpoints)
300
+
301
+ Fetch `/gradio_api/info` and then prefer the simplest route that exists:
302
+
303
+ - **Apply edited review CSV to a PDF**: `/review_apply`
304
+ - **Redact a PDF/image document**: `/doc_redact` — optional `handwrite_signature_checkbox` for AWS Textract (e.g. `Extract handwriting`, `Extract signatures`)
305
+ - **Summarise a PDF**: `/pdf_summarise`
306
+ - **Redact tabular files (CSV/XLSX/Parquet/DOCX)**: `/tabular_redact`
307
+
308
+ If those endpoints are not present in your deployment, fall back to the long UI-chained routes (`/apply_review_redactions`, `/redact_data`, etc.) and build `data[]` strictly from `/gradio_api/info`.
309
+
310
+ ### Common gotchas
311
+
312
+ - **Arity errors** (`needed: N, got: M`) mean you called a session-heavy UI handler with the wrong `data[]`. Prefer the short endpoints above.
313
+ - **`handle_file()` gotcha** (for `gradio_client` users): do **not** wrap server-internal upload paths (e.g. `/tmp/gradio_tmp/...`) with `handle_file()`. Pass them as plain strings.
314
+ - **Container-only outputs**: outputs may be written to container paths (e.g. `/home/user/app/output/`). Plan to download via `file=...` or use a mounted output directory in Docker.
315
+
316
+ ### Optional: MCP server
317
+
318
+ If you want external agents to call this app reliably without re-implementing Gradio upload/call/poll/download details, consider an **MCP server** that wraps the main tasks (`redact_document`, `apply_review_redactions`, `redact_tabular`, `summarise_document`) behind a small tool interface. See the [relevant documentation](https://github.com/seanpedrick-case/doc_redaction/blob/main/mcp_doc_redaction/README.md).
319
+
320
+ **Use as a library:** After installing from [PyPI](https://pypi.org/project/doc-redaction/) (`pip install doc_redaction`), you can call the same workflows as the Gradio `api_name` routes from Python. See the documentation: [Python Package usage (Python)](https://seanpedrick-case.github.io/doc_redaction/src/python_package_usage.html).
321
+
322
+ To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions above).
323
+
324
+ For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
325
+
326
+ Additional options on the 'Redaction settings' tab include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
327
+
328
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
agent-redact/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Agent redaction (Pi)
2
+
3
+ Pi-based agentic document redaction: local Docker orchestration and Hugging Face Space packaging.
4
+
5
+ | Path | Purpose |
6
+ |------|---------|
7
+ | [`pi/`](pi/) | Gradio UI, Pi RPC client, remote redaction helpers, runtime config |
8
+ | [`pi-agent/`](pi-agent/) | HF Space Dockerfile, sync script, and manifest |
9
+ | [`requirements_pi_agent.txt`](requirements_pi_agent.txt) | Python deps for the Pi agent image |
10
+
11
+ Per-user output isolation uses Gradio `session_hash` subfolders under `PI_WORKSPACE_DIR` (see `agent-redact/pi/session_workspace.py`). Enabled by default locally and on HF Spaces. Set `PI_SESSION_WORKSPACE=false` only if you want one shared workspace tree for all sessions.
12
+
13
+ ## Local Docker
14
+
15
+ Use the `pi-agent` service in [`docker-compose_llama_agentic.yml`](../docker-compose_llama_agentic.yml) (profile `27b_36`). See [`pi/agent/README.md`](pi/agent/README.md).
16
+
17
+ ## Hugging Face Space
18
+
19
+ Build from repo root:
20
+
21
+ ```bash
22
+ docker build -f agent-redact/pi-agent/Dockerfile .
23
+ ```
24
+
25
+ Sync to Space on pushes to `dev` via [`.github/workflows/sync-pi-agent-space.yml`](../.github/workflows/sync-pi-agent-space.yml).
agent-redact/pi-agent/.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .github
3
+ **/__pycache__
4
+ **/*.pyc
5
+ **/.pytest_cache
6
+ **/node_modules
7
+ workspace
8
+ output
9
+ input
10
+ config/pi_agent.env
agent-redact/pi-agent/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Example PDFs must be plain files in the Space repo (not Git LFS pointers).
2
+ *.pdf -filter -diff -merge
agent-redact/pi-agent/Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+ # Pi agent Gradio UI for Hugging Face Docker Space (remote doc_redaction backend).
3
+ # Build from monorepo root: docker build -f agent-redact/pi-agent/Dockerfile .
4
+
5
+ FROM node:22-bookworm-slim
6
+
7
+ ENV NODE_ENV=production
8
+ ENV DEBIAN_FRONTEND=noninteractive
9
+ ENV NPM_CONFIG_LOGLEVEL=warn
10
+ ENV PYTHONUNBUFFERED=1
11
+ ENV PYTHONDONTWRITEBYTECODE=1
12
+ ENV PYTHONPATH=/workspace/doc_redaction:/workspace/doc_redaction/agent-redact/pi
13
+ ENV PI_DEPLOYMENT_PROFILE=hf-space
14
+ ENV PI_DEFAULT_PROVIDER=google-gemini
15
+ ENV PI_DEFAULT_MODEL=gemini-flash-lite-latest
16
+ ENV DOC_REDACTION_GRADIO_URL=https://seanpedrickcase-document-redaction.hf.space
17
+ ENV GRADIO_SERVER_NAME=0.0.0.0
18
+ ENV GRADIO_SERVER_PORT=7860
19
+ ENV PI_WORKSPACE_DIR=/home/user/app/workspace
20
+ ENV PI_WORKDIR=/workspace/doc_redaction
21
+ ENV PI_UPLOAD_ROOT=/tmp/gradio
22
+ ENV PI_SESSION_DIR=/tmp/pi-sessions
23
+ ENV PI_OFFLINE=1
24
+ ENV PI_SKIP_VERSION_CHECK=1
25
+ ENV PI_GRADIO_SHOW_EXAMPLES=true
26
+ ENV HOME=/home/node
27
+
28
+ RUN apt-get update && apt-get install -y --no-install-recommends \
29
+ bash \
30
+ git \
31
+ curl \
32
+ ca-certificates \
33
+ procps \
34
+ python3 \
35
+ python3-pip \
36
+ python3-venv \
37
+ && rm -rf /var/lib/apt/lists/*
38
+
39
+ RUN npm install -g --ignore-scripts @earendil-works/pi-coding-agent
40
+
41
+ COPY agent-redact/requirements_pi_agent.txt /tmp/requirements_pi_agent.txt
42
+ RUN pip3 install --no-cache-dir --break-system-packages \
43
+ -r /tmp/requirements_pi_agent.txt \
44
+ && rm /tmp/requirements_pi_agent.txt
45
+
46
+ WORKDIR /workspace/doc_redaction
47
+
48
+ COPY agent-redact/pi agent-redact/pi
49
+ COPY skills skills
50
+ COPY tools tools
51
+ COPY config config
52
+ COPY intros intros
53
+ COPY AGENTS.md AGENTS.md
54
+ COPY doc_redaction/example_data doc_redaction/example_data
55
+
56
+ RUN test -f doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
57
+ && test -f doc_redaction/example_data/graduate-job-example-cover-letter.pdf \
58
+ && ! head -1 doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf \
59
+ | grep -q "^version https://git-lfs.github.com/spec/v1"
60
+
61
+ RUN mkdir -p /home/node/.pi/agent /home/user/app/workspace /tmp/gradio /tmp/pi-sessions \
62
+ && chown -R node:node /home/node/.pi /home/user/app /tmp/gradio /tmp/pi-sessions /workspace
63
+
64
+ USER node
65
+
66
+ RUN pi --version
67
+
68
+ EXPOSE 7860
69
+
70
+ CMD ["bash", "-c", "python3 agent-redact/pi/pi_agent_config.py && exec python3 agent-redact/pi/gradio_app.py"]
agent-redact/pi-agent/README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Agentic Document Redaction
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: agpl-3.0
10
+ ---
11
+
12
+ # Pi agent — agentic document redaction
13
+
14
+ Orchestrate document redaction with **[Pi](https://github.com/earendil-works/pi)** and **Google Gemini**. Heavy redaction runs on a separate **private [doc_redaction](https://huggingface.co/spaces/seanpedrickcase/document_redaction)** Hugging Face Space (simple text extraction + Local PII).
15
+
16
+ ## Before you start
17
+
18
+ 1. **Gemini API key** — paste in **Agent backend** → **Apply backend** (session-only; not stored on disk).
19
+ 2. **HF token** — Space admin should set `HF_TOKEN` under **Settings → Secrets** so this Space can call the private redaction backend. Users may optionally override per session in the UI.
20
+
21
+ ## Limitations
22
+
23
+ - **No face or signature VLM** — text-layer PII only via Local spaCy/Presidio on the remote Space.
24
+ - **No Pass 2 VLM** on this deployment.
25
+ - **Ephemeral storage** — download deliverables from **Workspace output files** before the Space restarts.
26
+ - **Human review** — outputs are not guaranteed complete; review redacted PDFs before release.
27
+
28
+ ## Defaults
29
+
30
+ | Setting | Value |
31
+ |---------|--------|
32
+ | Pi LLM | Gemini (`gemini-flash-lite-latest` default) |
33
+ | Redaction backend | `https://seanpedrickcase-document-redaction.hf.space` |
34
+ | Text extraction | `Local model - selectable text` |
35
+ | PII detection | `Local` |
36
+
37
+ ## Examples
38
+
39
+ Two sample PDFs load in **Redaction task** → **Try an example** (same demos as the main doc_redaction app). Examples are **on by default**; set Space variable `PI_GRADIO_SHOW_EXAMPLES=false` to hide them. (`SHOW_PI_EXAMPLES` is also accepted.)
40
+
41
+ If examples do not appear, the UI shows a short status message (usually missing PDFs in the image — rebuild after a successful sync with LFS materialization).
42
+
43
+ ## Development
44
+
45
+ This Space is synced from the [doc_redaction monorepo](https://github.com/seanpedrick-case/doc_redaction) on pushes to **`dev`** (see `.github/workflows/sync-pi-agent-space.yml`). Space: [seanpedrickcase/agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction).
agent-redact/pi-agent/sync-manifest.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paths copied from the monorepo root into the flattened Pi agent HF Space repo.
2
+ agent-redact/requirements_pi_agent.txt
3
+ agent-redact/pi
4
+ skills
5
+ tools
6
+ config/pi_agent.env.example
7
+ intros/pi_intro.txt
8
+ AGENTS.md
9
+ doc_redaction/example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
10
+ doc_redaction/example_data/graduate-job-example-cover-letter.pdf
agent-redact/pi-agent/sync_to_space.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Flatten monorepo paths into a temp directory for the Pi agent HF Space repo.
3
+ # Usage (from repo root):
4
+ # agent-redact/pi-agent/sync_to_space.sh /path/to/output-dir
5
+ set -euo pipefail
6
+
7
+ ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
8
+ OUT="${1:?Output directory required}"
9
+ MANIFEST="$(dirname "$0")/sync-manifest.txt"
10
+
11
+ _is_lfs_pointer() {
12
+ [[ -f "$1" ]] && head -1 "$1" 2>/dev/null | grep -q "^version https://git-lfs.github.com/spec/v1"
13
+ }
14
+
15
+ rm -rf "$OUT"
16
+ mkdir -p "$OUT"
17
+
18
+ cp "$(dirname "$0")/Dockerfile" "$OUT/Dockerfile"
19
+ cp "$(dirname "$0")/README.md" "$OUT/README.md"
20
+ cp "$(dirname "$0")/.dockerignore" "$OUT/.dockerignore"
21
+ cp "$(dirname "$0")/.gitattributes" "$OUT/.gitattributes"
22
+
23
+ while IFS= read -r line || [[ -n "$line" ]]; do
24
+ line="${line%%#*}"
25
+ line="$(echo "$line" | xargs)"
26
+ [[ -z "$line" ]] && continue
27
+ src="$ROOT/$line"
28
+ if [[ ! -e "$src" ]]; then
29
+ echo "Missing: $src" >&2
30
+ exit 1
31
+ fi
32
+ dest="$OUT/$line"
33
+ mkdir -p "$(dirname "$dest")"
34
+ cp -a "$src" "$dest"
35
+ if [[ "$line" == *.pdf ]] && _is_lfs_pointer "$dest"; then
36
+ echo "Copied file is a Git LFS pointer, not a PDF: $line" >&2
37
+ echo "Run 'git lfs pull' in the monorepo before syncing." >&2
38
+ exit 1
39
+ fi
40
+ done < "$MANIFEST"
41
+
42
+ echo "Flattened Pi agent Space tree: $OUT"
agent-redact/pi/agent/README.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pi agent config (Docker)
2
+
3
+ Runtime Pi config is **generated at container start** by [`agent-redact/pi/pi_agent_config.py`](../pi_agent_config.py) into `~/.pi/agent/models.json` and `~/.pi/agent/settings.json`.
4
+
5
+ Files in this folder (`settings.json`, `models.json`) are **templates/references** only — they are no longer bind-mounted into the container.
6
+
7
+ ## LLM backends (Pi orchestration)
8
+
9
+ The Pi agent (chat + redaction orchestration) can use:
10
+
11
+ | Provider key | Label | Pi API | Auth |
12
+ |--------------|-------|--------|------|
13
+ | `llama-cpp` | Local (llama-cpp) | `openai-completions` | None (local llama-inference) |
14
+ | `google-gemini` | Gemini | `google-generative-ai` | `GEMINI_API_KEY` or `GOOGLE_API_KEY` |
15
+ | `amazon-bedrock` | AWS Bedrock | `bedrock-converse-stream` | AWS SDK credentials (`AWS_ACCESS_KEY_ID`, etc.) |
16
+
17
+ This is separate from doc_redaction **Pass 2 VLM** (`{VLM_BASE_URL}` in redaction prompts), which still targets local llama-inference by default.
18
+
19
+ ### Environment variables
20
+
21
+ Copy [`config/pi_agent.env.example`](../../../config/pi_agent.env.example) to `config/pi_agent.env` (gitignored) or set on the host before `docker compose up`:
22
+
23
+ | Variable | Purpose |
24
+ |----------|---------|
25
+ | `PI_DEFAULT_PROVIDER` | `llama-cpp` \| `google-gemini` \| `amazon-bedrock` |
26
+ | `PI_DEFAULT_MODEL` | Model id within provider |
27
+ | `PI_LLAMA_BASE_URL` | Local OpenAI-compatible URL (default `http://llama-inference:8080/v1`) |
28
+ | `PI_LLAMA_MODEL_ID` | Local model id |
29
+ | `GEMINI_API_KEY` / `GOOGLE_API_KEY` | Gemini API key |
30
+ | `AWS_REGION` / `AWS_DEFAULT_REGION` | Bedrock region |
31
+ | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN` | Bedrock credentials (when not using SSO) |
32
+ | `AWS_PROFILE` | Named profile for SSO / shared credentials file (**required for Pi Bedrock with SSO**) |
33
+ | `PI_AWS_PROFILE` | Alternative to `AWS_PROFILE`; also used to auto-select profile when only `~/.aws` is mounted |
34
+ | `RUN_AWS_FUNCTIONS` | When `True`, use the AWS default credential chain (SSO, profile, role) |
35
+ | `PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS` | When `True` with `RUN_AWS_FUNCTIONS`, prefer SSO/chain over static env keys (default `True`, same as main app) |
36
+ | `PI_MAX_PAGES` | Maximum PDF pages allowed per redaction upload (falls back to `MAX_PAGES` / `MAX_DOC_PAGES`, default `3000`) |
37
+ | `PI_MAX_RETRIES` | Gemini quota / rate-limit retries for Pi auto-retry and Gradio backoff (default `5`; alias `PI_QUOTA_RETRY_ATTEMPTS`) |
38
+ | `PI_QUOTA_RETRY_DELAY_S` | Seconds between Gradio quota retries (default `60`) |
39
+ | `PI_COMPACTION_ENABLED` | Pi session auto-compaction in `settings.json` (`true` / `false`; unset uses template default, enabled) |
40
+ | `PI_COMPACTION_RESERVE_TOKENS` | Optional compaction `reserveTokens` (default `32768` from template) |
41
+ | `PI_COMPACTION_KEEP_RECENT_TOKENS` | Optional compaction `keepRecentTokens` (default `20000` from template) |
42
+
43
+ ### Usage logging (CSV / DynamoDB / S3)
44
+
45
+ Each completed Pi agent run (chat message or redaction task) writes **one row** to the **same usage log schema** as the main redaction app (`USAGE_LOG_FILE_NAME`, `USAGE_LOGS_FOLDER`, `S3_USAGE_LOGS_FOLDER`, `USAGE_LOG_DYNAMODB_TABLE_NAME`). Key fields:
46
+
47
+ | Log column | Pi agent value |
48
+ |------------|----------------|
49
+ | `task` | `agent` |
50
+ | `llm_model_name` | Pi provider/model (e.g. `amazon-bedrock/anthropic.claude-sonnet-4-6`) |
51
+ | `text_extraction_method` / `pii_detection_method` | From redaction task settings when applicable |
52
+ | `actual_time_taken_number` | Wall-clock seconds for the Pi RPC turn |
53
+ | `total_page_count` | Pages in scope for PDF redaction tasks |
54
+ | `llm_total_input_tokens` / `llm_total_output_tokens` | Pi orchestration LLM usage for that turn (from Pi `get_session_stats` delta, or assistant `usage` in session JSONL). Includes cache read/write in the input column. **VLM/tokens from doc_redaction Pass 1 are not included** (those stay on the main app usage log when you run redaction there directly). |
55
+
56
+ Toggle with `SAVE_LOGS_TO_CSV`, `SAVE_LOGS_TO_DYNAMODB`, and `RUN_AWS_FUNCTIONS` (required for S3 log upload). Access logs on session load use the main app access log paths separately.
57
+
58
+ At startup, if only `GOOGLE_API_KEY` is set, it is mirrored to `GEMINI_API_KEY` for Pi.
59
+
60
+ ### Gradio UI
61
+
62
+ Open **http://localhost:7862** → **Agent backend** accordion:
63
+
64
+ - Select provider and model
65
+ - Optionally enter Gemini / AWS credentials (**session-only** — not written to disk)
66
+ - Click **Apply backend** — regenerates config, restarts the Pi RPC subprocess, and starts a new session
67
+
68
+ Credential fields are cleared after apply.
69
+
70
+ ## Local model id
71
+
72
+ After the llama.cpp service is healthy, confirm the model id:
73
+
74
+ ```bash
75
+ curl http://localhost:8000/v1/models
76
+ ```
77
+
78
+ If the returned `id` differs from `unsloth/Qwen3.6-27B-MTP-GGUF`, set `PI_LLAMA_MODEL_ID` in `config/pi_agent.env` or compose environment and restart `pi-agent`.
79
+
80
+ ## In-container URLs for task prompts
81
+
82
+ When filling [`skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md`](../../../skills/doc-redaction-task-prompt/TASK_PROMPT_TEMPLATE.md) inside the Pi container, use:
83
+
84
+ | Placeholder | In-container value |
85
+ |-------------|-------------------|
86
+ | `{GRADIO_URL}` | `http://redaction-app-llama:7860` |
87
+ | `{VLM_BASE_URL}` | `http://llama-inference:8080` |
88
+ | `{INPUT_PATH}` | `/home/user/app/workspace/{session_hash}/{FILE_NAME}` (when `PI_SESSION_WORKSPACE=true`) |
89
+ | `{OUTPUT_BASE}` | `/home/user/app/workspace/{session_hash}/redact/{FILE_NAME}/` |
90
+
91
+ Host-side examples (`host.docker.internal`, `localhost:7861`) do not apply inside the compose network.
92
+
93
+ ## Usage
94
+
95
+ Start the stack (27B profile):
96
+
97
+ ```powershell
98
+ docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 up -d --build
99
+ ```
100
+
101
+ Interactive Pi TUI:
102
+
103
+ ```powershell
104
+ docker compose -f docker-compose_llama_agentic.yml exec -it pi-agent pi
105
+ ```
106
+
107
+ Gradio chat UI (browser):
108
+
109
+ Open **http://localhost:7862**. Use the **Redaction task** panel to upload a document, enter bullet-point requirements, and click **Start redaction task**. Pi receives the filled prompt from [`skills/Example prompt partnership.txt`](../../../skills/Example%20prompt%20partnership.txt) (file copied to `/home/user/app/workspace/`). The full prompt appears in the chat; Pi’s reply streams in the chat panel.
110
+
111
+ The UI also shows:
112
+
113
+ - **Agent backend** — switch between local, Gemini, and Bedrock
114
+ - **Chat** — streamed assistant text
115
+ - **Activity** — agent/turn lifecycle, compaction, auto-retry, tool start/end
116
+ - **Tool output** — live bash/read output from `tool_execution_update` / `tool_execution_end`
117
+ - **Thinking** — optional stream (`PI_GRADIO_SHOW_THINKING=true`)
118
+ - **Abort** — sends Pi RPC `abort` and cancels the in-flight Gradio handler
119
+ - **Workspace output files** — browse and download redaction artifacts
120
+
121
+ Optional env vars on `pi-agent`: `PI_GRADIO_SHOW_THINKING`, `PI_GRADIO_SHOW_TOOL_OUTPUT`, `PI_GRADIO_TOOL_OUTPUT_MAX`, `PI_GRADIO_ACTIVITY_MAX_LINES`.
122
+
123
+ When a Pi run completes, the chat shows an **Agent finished** (or **Agent stopped**) line, a Gradio info toast appears, and the browser tab title flashes for ~15 seconds. Desktop notifications are shown when the browser has granted notification permission (requested on first click/keypress in the Pi UI).
124
+
125
+ Run the UI locally (outside Docker):
126
+
127
+ ```powershell
128
+ cd agent-redact/pi
129
+ pip install -r ../requirements_pi_agent.txt
130
+ # Pi orchestration subprocess (required for Apply backend / chat):
131
+ npm install -g @earendil-works/pi-coding-agent
132
+ python pi_agent_config.py
133
+ python gradio_app.py
134
+ ```
135
+
136
+ **Apply backend** starts `pi --mode rpc`. If you see `FileNotFoundError` / “Pi CLI not found”, install Node.js, run the `npm install` line above, and ensure `pi` (or `pi.cmd` on Windows) is on `PATH`. Optional: `PI_EXECUTABLE=C:\Users\you\AppData\Roaming\npm\pi.cmd` in `config/pi_agent.env`.
137
+
138
+ RPC mode (automation, no Gradio):
139
+
140
+ ```powershell
141
+ docker compose -f docker-compose_llama_agentic.yml exec -T pi-agent pi --mode rpc
142
+ ```
143
+
144
+ Skills are synced from the repo `skills/` tree into **`{PI_WORKSPACE_DIR}/.pi/skills/`** on startup (read-only). Pi runs with `cwd` in the user’s session subfolder and `--no-skills` so it does not load skills from the git checkout. Use `/skill:doc-redaction-app` etc. Set `PI_SKILLS_RESYNC=true` to refresh copies from the repo.
145
+
146
+ Sessions persist in the **`pi-agent-sessions`** Docker volume at **`~/.pi/agent/sessions/`** (Pi’s default session location inside the container). Override with `PI_SESSION_DIR` if needed.
147
+
148
+ On **HF Space** (`PI_DEPLOYMENT_PROFILE=hf-space`), sessions go to **`/tmp/pi-sessions`** instead (ephemeral; lost on restart).
149
+
150
+ ## Python dependencies
151
+
152
+ The Pi image installs [`requirements_pi_agent.txt`](../requirements_pi_agent.txt) — Gradio UI + `gradio-client`, HTTP clients, CSV/PDF review helpers (`pandas`, `pymupdf`), and common utilities. It **does not** include spaCy, Presidio, or OCR; heavy redaction runs in `redaction-app-llama`.
153
+
154
+ Rebuild after changing that file:
155
+
156
+ ```powershell
157
+ docker compose -f docker-compose_llama_agentic.yml --profile 27b_36 build pi-agent
158
+ ```
159
+
160
+ ## HF Space profile (remote redaction backend)
161
+
162
+ Set `PI_DEPLOYMENT_PROFILE=hf-space` to run the Pi Gradio UI as a **Hugging Face Docker Space** that orchestrates with **Gemini only** and calls a **remote** doc_redaction Space over HTTPS.
163
+
164
+ | Area | HF Space value |
165
+ |------|----------------|
166
+ | Pi LLM | Gemini only (`PI_DEFAULT_PROVIDER=google-gemini`) |
167
+ | Redaction app | `DOC_REDACTION_GRADIO_URL` (default `https://seanpedrickcase-document-redaction.hf.space`) |
168
+ | Auth to redaction | `HF_TOKEN` / `DOC_REDACTION_HF_TOKEN` (Space secret + optional UI override) |
169
+ | Text extraction / PII | Locked to `Local model - selectable text` + `Local` |
170
+ | VLM faces / signatures | Disabled |
171
+ | Port | `7860` |
172
+ | Pi session logs | `/tmp/pi-sessions` (`PI_SESSION_DIR`; ephemeral) |
173
+
174
+ Package and Dockerfile: [`agent-redact/pi-agent/`](../../pi-agent/). Pushes to [agentic_document_redaction](https://huggingface.co/spaces/seanpedrickcase/agentic_document_redaction) on **`dev`** branch via [`.github/workflows/sync-pi-agent-space.yml`](../../../.github/workflows/sync-pi-agent-space.yml) (GitHub secrets: `HF_TOKEN`, `HF_USERNAME`, `HF_EMAIL`).
175
+
176
+ Local build test from monorepo root:
177
+
178
+ ```powershell
179
+ docker build -f agent-redact/pi-agent/Dockerfile -t pi-agent-hf-space .
180
+ docker run --rm -p 7860:7860 -e GEMINI_API_KEY=... -e HF_TOKEN=... pi-agent-hf-space
181
+ ```
182
+
183
+ Pi uses `gradio_client` + `agent-redact/pi/remote_redaction.py` to upload/download from the remote Space; prompts include `{REMOTE_BACKEND_GUIDANCE}` (see [`redaction_prompt.py`](../redaction_prompt.py)).
agent-redact/pi/agent/models.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "providers": {
3
+ "llama-cpp": {
4
+ "baseUrl": "http://llama-inference:8080/v1",
5
+ "api": "openai-completions",
6
+ "apiKey": "llama-cpp",
7
+ "compat": {
8
+ "supportsDeveloperRole": false,
9
+ "supportsReasoningEffort": false,
10
+ "supportsUsageInStreaming": false,
11
+ "maxTokensField": "max_tokens"
12
+ },
13
+ "models": [
14
+ {
15
+ "id": "unsloth/Qwen3.6-27B-MTP-GGUF",
16
+ "name": "Qwen 3.6 27B (local)",
17
+ "reasoning": false,
18
+ "input": ["text", "image"],
19
+ "contextWindow": 114688,
20
+ "maxTokens": 32768,
21
+ "cost": {
22
+ "input": 0,
23
+ "output": 0,
24
+ "cacheRead": 0,
25
+ "cacheWrite": 0
26
+ }
27
+ }
28
+ ]
29
+ }
30
+ }
31
+ }
agent-redact/pi/agent/settings.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "defaultProvider": "llama-cpp",
3
+ "defaultModel": "unsloth/Qwen3.6-27B-MTP-GGUF",
4
+ "defaultThinkingLevel": "off",
5
+ "hideThinkingBlock": true,
6
+ "compaction": {
7
+ "enabled": true,
8
+ "reserveTokens": 32768,
9
+ "keepRecentTokens": 20000
10
+ },
11
+ "branchSummary": {
12
+ "skipPrompt": true,
13
+ "reserveTokens": 32768
14
+ },
15
+ "retry": {
16
+ "enabled": true,
17
+ "maxRetries": 5,
18
+ "baseDelayMs": 2000,
19
+ "provider": {
20
+ "timeoutMs": 3600000,
21
+ "maxRetries": 5,
22
+ "maxRetryDelayMs": 60000
23
+ }
24
+ },
25
+ "enableSkillCommands": true,
26
+ "sessionDir": "sessions",
27
+ "steeringMode": "one-at-a-time",
28
+ "followUpMode": "one-at-a-time",
29
+ "terminal": {
30
+ "showTerminalProgress": false
31
+ }
32
+ }
agent-redact/pi/bootstrap_pi_config.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pi agent process bootstrap (env file + workspace) before ``tools.config`` import."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ _DOCKER_WORKSPACE = Path("/home/user/app/workspace")
11
+ _DOCKER_UPLOAD_ROOT = Path("/tmp/gradio")
12
+ _DOCKER_PI_WORKDIR = Path("/workspace/doc_redaction")
13
+ _PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
14
+
15
+
16
def _pi_running_in_container() -> bool:
    """
    Report whether the Pi process looks like it is inside Docker / an HF Space.

    Two signals count: the ``/.dockerenv`` marker file, or the container
    working directory existing *and* holding the partnership prompt template.
    Callers use this so that a ``C:\\home\\user\\app\\workspace`` folder created
    by mistake on Windows is not treated as the compose mount.
    """
    docker_marker = Path("/.dockerenv")
    if not docker_marker.is_file():
        # No marker file: fall back to checking the container work directory.
        workdir_present = _DOCKER_PI_WORKDIR.is_dir()
        return workdir_present and _partnership_template_exists(_DOCKER_PI_WORKDIR)
    return True
28
+
29
+
30
def ensure_pi_workspace_dir(repo_root: Path | None = None) -> str:
    """
    Choose the Pi workspace directory, create it, and export it.

    Resolution order:

    1. A non-blank ``PI_WORKSPACE_DIR`` environment variable.
    2. The Docker workspace mount, when running in a container and the mount
       directory exists.
    3. ``{repo_root}/workspace`` (local Windows/macOS/Linux dev).

    Args:
        repo_root: Checkout root; defaults to ``parents[2]`` of this file.

    Returns:
        The resolved path, which is also written to
        ``os.environ["PI_WORKSPACE_DIR"]``.
    """
    checkout_root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
    configured = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
    if configured:
        workspace = Path(configured)
    elif _pi_running_in_container() and _DOCKER_WORKSPACE.is_dir():
        workspace = _DOCKER_WORKSPACE
    else:
        workspace = checkout_root / "workspace"
    workspace.mkdir(parents=True, exist_ok=True)
    workspace_str = str(workspace.resolve())
    os.environ["PI_WORKSPACE_DIR"] = workspace_str
    return workspace_str
50
+
51
+
52
def ensure_pi_upload_root(repo_root: Path | None = None) -> str:
    """
    Decide where Gradio keeps ``gr.File`` uploads, create it, and export it.

    Must run before ``import gradio`` so ``GRADIO_TEMP_DIR`` matches validation
    in ``redaction_prompt._resolve_and_validate_upload_path``.

    Resolution order:

    1. A non-blank ``PI_UPLOAD_ROOT``.
    2. A non-blank ``GRADIO_TEMP_DIR``.
    3. Docker ``/tmp/gradio`` when running in a container and it exists.
    4. ``{repo}/workspace/.gradio_uploads`` (local dev; stays inside the app
       tree so ``tools.config.ensure_folder_within_app_directory`` accepts
       ``GRADIO_TEMP_DIR``).

    Args:
        repo_root: Checkout root; defaults to ``parents[2]`` of this file.

    Returns:
        The resolved path. ``PI_UPLOAD_ROOT`` is always set to it;
        ``GRADIO_TEMP_DIR`` is set only when it was blank or missing.
    """
    checkout_root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
    explicit_root = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
    gradio_temp = (os.environ.get("GRADIO_TEMP_DIR") or "").strip()
    if explicit_root:
        upload_dir = Path(explicit_root)
    elif gradio_temp:
        upload_dir = Path(gradio_temp)
    elif _pi_running_in_container() and _DOCKER_UPLOAD_ROOT.is_dir():
        upload_dir = _DOCKER_UPLOAD_ROOT
    else:
        upload_dir = checkout_root / "workspace" / ".gradio_uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)
    upload_str = str(upload_dir.resolve())
    os.environ["PI_UPLOAD_ROOT"] = upload_str
    if not gradio_temp:
        # Never override a GRADIO_TEMP_DIR the operator already supplied.
        os.environ["GRADIO_TEMP_DIR"] = upload_str
    return upload_str
83
+
84
+
85
def _partnership_template_exists(repo: Path) -> bool:
    """Return True when *repo* contains the partnership prompt template file."""
    template_path = repo / _PARTNERSHIP_TEMPLATE
    return template_path.is_file()
87
+
88
+
89
def ensure_pi_workdir(repo_root: Path | None = None) -> str:
    """
    Resolve ``PI_WORKDIR`` (monorepo root for skills/ and Pi RPC cwd) and export it.

    Resolution order:

    1. A non-blank ``PI_WORKDIR`` — honoured only when the partnership prompt
       template exists under it.
    2. The Docker work directory, when running in a container and the
       template exists there.
    3. The checkout root (``agent-redact/pi`` → ``parents[2]``).

    Args:
        repo_root: Checkout root; defaults to ``parents[2]`` of this file.

    Returns:
        The chosen directory as a string, also written to
        ``os.environ["PI_WORKDIR"]``.
    """
    checkout_root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
    configured = (os.environ.get("PI_WORKDIR") or "").strip()
    if configured and _partnership_template_exists(Path(configured)):
        workdir = str(Path(configured).resolve())
    elif _pi_running_in_container() and _partnership_template_exists(
        _DOCKER_PI_WORKDIR
    ):
        workdir = str(_DOCKER_PI_WORKDIR.resolve())
    else:
        workdir = str(checkout_root)
    os.environ["PI_WORKDIR"] = workdir
    return workdir
112
+
113
+
114
def pi_repo_root_path(repo_root: Path | None = None) -> Path:
    """Resolve ``PI_WORKDIR`` via :func:`ensure_pi_workdir` and return it as a :class:`~pathlib.Path`."""
    workdir = ensure_pi_workdir(repo_root)
    return Path(workdir)
117
+
118
+
119
def load_pi_agent_env_file(config_path: str | Path | None = None) -> bool:
    """
    Load the Pi agent env file into ``os.environ`` without overriding existing vars.

    Must run before ``import pi_agent_config`` so module-level defaults see the file.

    Args:
        config_path: Env file to load; falls back to the ``APP_CONFIG_PATH``
            environment variable when omitted or empty.

    Returns:
        ``True`` when the file exists and was passed to ``load_dotenv``,
        ``False`` when no such file exists.
    """
    source = config_path or os.environ.get("APP_CONFIG_PATH", "")
    env_file = Path(source).expanduser()
    if env_file.is_file():
        load_dotenv(env_file, override=False)
        return True
    return False
130
+
131
+
132
def ensure_pi_config_env(repo_root: Path | None = None) -> str:
    """
    Prepare the process environment so ``tools.config`` loads the Pi agent env file.

    Must run before any ``from pi_agent_config import ...`` or ``tools.config``
    import that depends on Pi env vars. Steps, in order: default ``APP_TYPE``
    to ``"pi"``, default ``APP_CONFIG_PATH`` to ``{root}/config/pi_agent.env``,
    load that env file, resolve the work / workspace / upload directories, and
    finally ensure the workspace skills. Existing ``APP_TYPE`` and
    ``APP_CONFIG_PATH`` values are left untouched.

    Args:
        repo_root: Checkout root; defaults to ``parents[2]`` of this file.

    Returns:
        The effective ``APP_CONFIG_PATH``.
    """
    checkout_root = (repo_root or Path(__file__).resolve().parents[2]).resolve()
    if "APP_TYPE" not in os.environ:
        os.environ["APP_TYPE"] = "pi"
    configured_path = os.environ.get("APP_CONFIG_PATH", "")
    if not configured_path.strip():
        os.environ["APP_CONFIG_PATH"] = str(checkout_root / "config" / "pi_agent.env")
    load_pi_agent_env_file()
    for ensure_dir in (ensure_pi_workdir, ensure_pi_workspace_dir, ensure_pi_upload_root):
        ensure_dir(checkout_root)
    # NOTE(review): imported only after the env vars above are set — presumably
    # because pi_workspace_skills reads them at import time; confirm.
    from pi_workspace_skills import ensure_workspace_skills

    ensure_workspace_skills()
    return os.environ["APP_CONFIG_PATH"]
agent-redact/pi/gradio_app.py ADDED
@@ -0,0 +1,1769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio chat UI for Pi (RPC mode).
4
+
5
+ Streams Pi RPC events into a chatbot, activity log, tool output panel, and
6
+ optional thinking trace. Includes a redaction task panel driven by the
7
+ partnership prompt template.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import sys
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from fastapi import FastAPI
19
+
20
+ _REPO_ROOT = Path(__file__).resolve().parents[2]
21
+ if str(_REPO_ROOT) not in sys.path:
22
+ sys.path.insert(0, str(_REPO_ROOT))
23
+
24
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
25
+
26
+ from bootstrap_pi_config import ensure_pi_config_env
27
+
28
+ ensure_pi_config_env(_REPO_ROOT)
29
+
30
+ import gradio as gr
31
+ from output_files import (
32
+ collect_final_output_files,
33
+ gradio_allowed_paths,
34
+ refresh_workspace_output_files_stub,
35
+ refresh_workspace_panel,
36
+ workspace_files_download_fn,
37
+ )
38
+ from pi_agent_config import (
39
+ apply_session_credentials,
40
+ configure_aws_credentials,
41
+ credential_status_markdown,
42
+ default_model_for_provider,
43
+ gemini_api_key_configured,
44
+ get_default_provider,
45
+ is_hf_space_profile,
46
+ mirror_hf_token_from_env,
47
+ models_for_provider,
48
+ normalize_provider,
49
+ provider_choices,
50
+ provider_label,
51
+ resolved_default_model,
52
+ write_runtime_config,
53
+ )
54
+ from pi_examples import example_rows, examples_status_markdown
55
+ from pi_rpc_client import (
56
+ PiRpcClient,
57
+ PiRpcError,
58
+ PiStreamEvent,
59
+ assistant_text_since_last_user,
60
+ default_client,
61
+ is_rate_limit_error,
62
+ last_assistant_turn_error,
63
+ )
64
+ from redaction_prompt import (
65
+ DEFAULT_OCR_METHOD,
66
+ DEFAULT_PII_METHOD,
67
+ OCR_METHOD_CHOICES,
68
+ PII_METHOD_CHOICES,
69
+ RedactionTaskSettings,
70
+ pages_to_process_count,
71
+ pdf_page_count,
72
+ prepare_redaction_task,
73
+ )
74
+ from session_logs import collect_session_log_download, persist_session_log
75
+
76
+ # Before any ``tools.config`` import (e.g. session_workspace): compose may inject
77
+ # empty AWS_REGION= which would freeze a blank region in tools.config.AWS_REGION.
78
+ mirror_hf_token_from_env()
79
+ configure_aws_credentials()
80
+
81
+ from pi_session_usage import resolve_session_token_usage, usage_for_completed_turn
82
+ from session_workspace import (
83
+ init_session_workspace,
84
+ prepare_session_workspace,
85
+ session_workspace_dir,
86
+ workspace_base_dir,
87
+ workspace_context_prefix,
88
+ )
89
+
90
+ from tools.aws_functions import export_outputs_to_s3, s3_outputs_upload_ready
91
+ from tools.config import (
92
+ ACTIVITY_MAX_LINES,
93
+ EMPTY_SEND_WITH_FILE_HINT,
94
+ HOST_NAME,
95
+ PI_GRADIO_PORT,
96
+ PI_INTRO_TEXT,
97
+ PI_UI_HOST,
98
+ PI_UI_TITLE,
99
+ QUOTA_CONTINUE_PROMPT,
100
+ QUOTA_RETRY_ATTEMPTS,
101
+ QUOTA_RETRY_DELAY_S,
102
+ RUN_FASTAPI,
103
+ SAVE_OUTPUTS_TO_S3,
104
+ SHOW_THINKING,
105
+ SHOW_TOOL_OUTPUT,
106
+ THINKING_DISPLAY_MAX,
107
+ THINKING_PANEL_CSS,
108
+ TOOL_OUTPUT_MAX,
109
+ )
110
+ from tools.gradio_platform import (
111
+ create_fastapi_app,
112
+ log_agent_usage_event,
113
+ log_platform_access,
114
+ mount_or_launch,
115
+ )
116
+
117
+ IS_HF_SPACE = is_hf_space_profile()
118
+ # Use PI_GRADIO_PORT only — GRADIO_SERVER_PORT is the main app's default (7860) and is
119
+ # written into os.environ during tools.config import, which would override 7862 here.
120
+ PI_UI_PORT = PI_GRADIO_PORT
121
+
122
+ AGENT_FINISH_SIGNAL_NONE = ""
123
+ AGENT_FINISH_SIGNAL_FINISHED = "finished"
124
+ AGENT_FINISH_SIGNAL_ABORTED = "aborted"
125
+ AGENT_FINISH_SIGNAL_ERROR = "error"
126
+
127
+ PI_AGENT_FINISH_HEAD_HTML = """
128
+ <script>
129
+ (function () {
130
+ function requestNotificationPermissionOnce() {
131
+ if (typeof Notification === "undefined") return;
132
+ if (Notification.permission !== "default") return;
133
+ try { Notification.requestPermission(); } catch (e) {}
134
+ }
135
+ document.addEventListener("click", requestNotificationPermissionOnce, { once: true });
136
+ document.addEventListener("keydown", requestNotificationPermissionOnce, { once: true });
137
+ })();
138
+ </script>
139
+ """
140
+
141
+ PI_AGENT_FINISH_NOTIFY_JS = """
142
+ async (...outputs) => {
143
+ const finishSignal = outputs[outputs.length - 1];
144
+ if (!finishSignal) {
145
+ return outputs;
146
+ }
147
+ const isAborted = finishSignal === "aborted";
148
+ const isError = finishSignal === "error";
149
+ const title = isAborted ? "Agent stopped" : (isError ? "Agent error" : "Agent finished");
150
+ const body = isAborted
151
+ ? "The Pi agent run was aborted."
152
+ : (isError
153
+ ? "The Pi agent run ended with an error."
154
+ : "The Pi agent has finished its task. Review the chat for results.");
155
+ const originalTitle = document.title;
156
+ let flashOn = true;
157
+ const flashInterval = setInterval(() => {
158
+ document.title = flashOn ? ("✓ " + title) : originalTitle;
159
+ flashOn = !flashOn;
160
+ }, 1000);
161
+ setTimeout(() => {
162
+ clearInterval(flashInterval);
163
+ document.title = originalTitle;
164
+ }, 15000);
165
+ if (typeof Notification !== "undefined") {
166
+ try {
167
+ if (Notification.permission === "granted") {
168
+ new Notification(title, { body: body, tag: "pi-agent-finish" });
169
+ } else if (Notification.permission === "default") {
170
+ const perm = await Notification.requestPermission();
171
+ if (perm === "granted") {
172
+ new Notification(title, { body: body, tag: "pi-agent-finish" });
173
+ }
174
+ }
175
+ } catch (e) {}
176
+ }
177
+ outputs[outputs.length - 1] = "";
178
+ return outputs;
179
+ }
180
+ """
181
+
182
+ app = None
183
+
184
+
185
+ def _agent_finish_chat_notice(*, aborted: bool = False, error: bool = False) -> str:
186
+ if aborted:
187
+ return (
188
+ "---\n\n"
189
+ "**Agent stopped** — the run was aborted. You can send a follow-up message "
190
+ "or start a new task."
191
+ )
192
+ if error:
193
+ return (
194
+ "---\n\n"
195
+ "**Agent stopped** — the run ended with an error. Review the activity log "
196
+ "and send a follow-up if needed."
197
+ )
198
+ return (
199
+ "---\n\n"
200
+ "**Agent finished** — the task is complete. Review the outputs below or send "
201
+ "a follow-up message if you need changes."
202
+ )
203
+
204
+
205
+ def _show_agent_finish_toast(*, aborted: bool = False, error: bool = False) -> None:
206
+ try:
207
+ if aborted:
208
+ gr.Info("Agent stopped (aborted).", duration=8)
209
+ elif error:
210
+ gr.Info("Agent stopped with an error.", duration=8)
211
+ else:
212
+ gr.Info("Agent finished — task complete.", duration=8)
213
+ except Exception:
214
+ pass
215
+
216
+
217
def _agent_finish_signal_value(*, aborted: bool = False, error: bool = False) -> str:
    """
    Map the run outcome to the signal string consumed by the browser-notify JS.

    ``aborted`` takes priority over ``error`` so that, when both flags are
    set, the browser notification agrees with the chat notice and the Gradio
    toast (both of which report "aborted" in that case).

    Returns:
        ``AGENT_FINISH_SIGNAL_ABORTED``, ``AGENT_FINISH_SIGNAL_ERROR`` or
        ``AGENT_FINISH_SIGNAL_FINISHED``.
    """
    if aborted:
        return AGENT_FINISH_SIGNAL_ABORTED
    if error:
        return AGENT_FINISH_SIGNAL_ERROR
    return AGENT_FINISH_SIGNAL_FINISHED
223
+
224
+
225
def _notify_agent_finished(*, aborted: bool = False, error: bool = False) -> str:
    """Raise the Gradio toast, then return the browser-notify signal for the finish handler."""
    outcome = {"aborted": aborted, "error": error}
    _show_agent_finish_toast(**outcome)
    return _agent_finish_signal_value(**outcome)
229
+
230
+
231
def _append_agent_finish_notice(
    history: list[dict[str, Any]],
    completed_segments: list[str],
    streaming_text: str,
    *,
    aborted: bool = False,
    error: bool = False,
) -> tuple[list[dict[str, Any]], list[str], str]:
    """
    Append the end-of-run notice as a chat segment and refresh the last bubble.

    The last history entry is rewritten (in place) only when it is an
    assistant message.

    Returns:
        ``(history, completed_segments, streaming_text)`` after the update.
    """
    notice = _agent_finish_chat_notice(aborted=aborted, error=error)
    completed_segments, streaming_text = _append_chat_segment(
        completed_segments, streaming_text, notice
    )
    last_message = history[-1] if history else None
    if last_message is not None and last_message.get("role") == "assistant":
        last_message["content"] = _assistant_display_text(
            completed_segments, streaming_text
        )
    return history, completed_segments, streaming_text
248
+
249
+
250
+ def _passthrough_chat_outputs(*outputs: Any) -> tuple[Any, ...]:
251
+ """Passthrough for ``.then(js=...)`` — Gradio forces ``queue=False`` when ``fn is None``."""
252
+ return outputs
253
+
254
+
255
+ def _client_provider_model(client: PiRpcClient | None) -> tuple[str, str]:
256
+ if client is None:
257
+ return "", ""
258
+ try:
259
+ state = client.get_state()
260
+ except PiRpcError:
261
+ return "", ""
262
+ model = state.get("model") or {}
263
+ provider = str(model.get("provider") or state.get("provider") or "")
264
+ model_label = str(model.get("id") or model.get("name") or "")
265
+ return provider, model_label
266
+
267
+
268
def _llm_model_label(client: PiRpcClient | None) -> str:
    """Return ``provider/model`` for the client, or whichever half is known (possibly empty)."""
    provider, model = _client_provider_model(client)
    if not (provider and model):
        return model or provider
    return f"{provider}/{model}"
273
+
274
+
275
def _after_pi_task(
    *,
    session_hash: str,
    client: PiRpcClient | None,
    s3_output_folder: str,
    save_outputs_to_s3: bool,
    document_name: str = "",
    started_at: float | None = None,
    base_file: str | None = None,
    ocr_method: str = "",
    pii_method: str = "",
    total_page_count: int = 0,
    vlm_model_name: str | None = None,
    llm_input_tokens: int = 0,
    llm_output_tokens: int = 0,
) -> None:
    """
    Run the post-task bookkeeping for a Pi agent run.

    In order: log a usage event, persist the session log, then export the
    final output files to S3 when that is configured.

    Args:
        session_hash: Session identifier used for logging and output lookup.
        client: Pi RPC client, used for the model label and the session log.
        s3_output_folder: Destination folder; a falsy value skips the export.
        save_outputs_to_s3: Passed through to the S3 readiness check and export.
        document_name: Document name recorded in the usage event.
        started_at: ``time.time()`` value at task start; when falsy the logged
            duration is an empty string.
        base_file: Passed through to the S3 export.
        ocr_method: OCR method recorded in the usage event.
        pii_method: PII method recorded in the usage event.
        total_page_count: Page count recorded in the usage event.
        vlm_model_name: VLM name; falls back to the ``PI_VLM_MODEL`` env var.
        llm_input_tokens: Input token count recorded in the usage event.
        llm_output_tokens: Output token count recorded in the usage event.
    """
    duration = round(time.time() - started_at, 2) if started_at else ""
    log_agent_usage_event(
        session_hash=session_hash,
        duration_seconds=duration,
        document_name=document_name,
        total_page_count=total_page_count,
        ocr_method=ocr_method,
        pii_method=pii_method,
        llm_model_name=_llm_model_label(client),
        vlm_model_name=vlm_model_name or os.environ.get("PI_VLM_MODEL", ""),
        llm_input_tokens=llm_input_tokens,
        llm_output_tokens=llm_output_tokens,
        task="agent",
    )
    persist_session_log(client, session_hash=session_hash)
    # Single source of truth for the "collect outputs, then upload if ready" step.
    _export_workspace_outputs(
        session_hash,
        s3_output_folder,
        save_outputs_to_s3,
        base_file,
    )
318
+
319
+
320
def _export_workspace_outputs(
    session_hash: str,
    s3_output_folder: str,
    save_outputs_to_s3: bool,
    base_file: str | None = None,
) -> None:
    """
    Upload the session's final output files to S3 when everything is in place.

    Nothing is uploaded when there are no output files, when
    ``s3_output_folder`` is falsy, or when ``s3_outputs_upload_ready`` says
    the upload is not ready.
    """
    output_paths = collect_final_output_files(session_hash)
    if not output_paths:
        return
    if not s3_output_folder:
        return
    if not s3_outputs_upload_ready(save_outputs_to_s3=save_outputs_to_s3):
        return
    export_outputs_to_s3(
        output_paths,
        s3_output_folder,
        save_outputs_to_s3,
        base_file,
    )
338
+
339
+
340
+ def _clone_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
341
+ return [{"role": item["role"], "content": item["content"]} for item in history]
342
+
343
+
344
def _truncate_thinking(text: str, limit: int = THINKING_DISPLAY_MAX) -> str:
    """
    Keep only the last ``limit`` characters of the thinking trace.

    Text that already fits is returned unchanged. Otherwise the result is a
    banner stating how many earlier characters were hidden, followed by the
    tail of the text.

    Args:
        text: Full thinking trace.
        limit: Maximum number of trailing characters to keep; values below
            zero are treated as zero.
    """
    # A zero limit must hide everything: ``text[-0:]`` would return the whole
    # string, so slice from an explicit start index instead.
    limit = max(limit, 0)
    if len(text) <= limit:
        return text
    hidden = len(text) - limit
    tail = text[hidden:]
    return f"… [{hidden:,} earlier chars hidden]\n\n{tail}"
349
+
350
+
351
+ def _assistant_display_text(completed_segments: list[str], current: str) -> str:
352
+ parts = [segment.strip() for segment in completed_segments if segment.strip()]
353
+ if current.strip():
354
+ parts.append(current.strip())
355
+ return "\n\n".join(parts)
356
+
357
+
358
def _finalize_assistant_chat(
    client: PiRpcClient,
    history: list[dict[str, Any]],
    *,
    completed_segments: list[str],
    streaming_text: str,
    activity: list[str],
) -> None:
    """
    Make sure the last assistant bubble is not left empty (tool-only Gemini turns).

    Mutates ``history[-1]`` in place. Sources are tried in order: the streamed
    segments, the bubble's existing content, the assistant text fetched from
    the Pi client, and finally a fixed "tools only" note when any activity was
    recorded. Does nothing when the last entry is not an assistant message.
    """
    if not history:
        return
    bubble = history[-1]
    if bubble.get("role") != "assistant":
        return

    streamed = _assistant_display_text(completed_segments, streaming_text)
    if streamed.strip():
        bubble["content"] = streamed
        return
    if bubble.get("content", "").strip():
        return

    try:
        recovered = assistant_text_since_last_user(client.get_messages())
    except PiRpcError:
        recovered = ""

    if recovered.strip():
        bubble["content"] = recovered
    elif activity:
        bubble["content"] = (
            "_This run completed using tools only (no assistant prose was streamed). "
            "See **Thinking log** for step-by-step activity._"
        )
391
+
392
+
393
def _gemini_key_error() -> str | None:
    """Return the "Gemini API key required" message on the HF Space profile without a key, else ``None``."""
    key_missing = IS_HF_SPACE and not gemini_api_key_configured()
    if not key_missing:
        return None
    return (
        "**Gemini API key required.** Paste your key in **Agent backend** and click "
        "**Apply backend** before chatting or starting a redaction task."
    )
400
+
401
+
402
def _ensure_client(
    client: PiRpcClient | None,
    session_hash: str = "",
) -> PiRpcClient:
    """
    Return a running Pi RPC client, starting a new one when needed.

    An already-running ``PiRpcClient`` is returned as-is. Otherwise a default
    client is created for the session, started, and pointed at the default
    provider/model; a ``PiRpcError`` from ``set_model`` is ignored.

    Raises:
        PiRpcError: When the Gemini API key check reports a missing key.
    """
    missing_key_message = _gemini_key_error()
    if missing_key_message:
        raise PiRpcError(missing_key_message)
    if isinstance(client, PiRpcClient) and client.running:
        return client
    fresh = default_client(session_hash or None)
    fresh.start()
    provider = normalize_provider(get_default_provider())
    model = resolved_default_model(provider)
    try:
        fresh.set_model(provider, model)
    except PiRpcError:
        # Best effort: a failed set_model still leaves the client usable.
        pass
    return fresh
420
+
421
+
422
def _coerce_client(client: Any) -> PiRpcClient | None:
    """Return *client* when it is a ``PiRpcClient`` instance, otherwise ``None``."""
    if isinstance(client, PiRpcClient):
        return client
    return None
424
+
425
+
426
def _truncate(text: str, limit: int = TOOL_OUTPUT_MAX) -> str:
    """
    Cut tool output down to roughly ``limit`` characters.

    Text that already fits is returned unchanged. Otherwise the first
    ``limit - 40`` characters are kept (40 are reserved for the notice) and a
    notice with the number of dropped characters is appended.

    Args:
        text: Raw tool output.
        limit: Size budget in characters. Budgets below 40 keep no text at
            all rather than producing a negative slice.
    """
    if len(text) <= limit:
        return text
    # Clamp so a small limit cannot turn ``text[:limit - 40]`` into a
    # "drop the last N characters" slice.
    keep = max(limit - 40, 0)
    return text[:keep] + f"\n\n… [{len(text) - keep} chars truncated]"
430
+
431
+
432
+ def _format_activity(lines: list[str]) -> str:
433
+ if not lines:
434
+ return "_No activity yet._"
435
+ return "\n".join(f"- {line}" for line in lines[-ACTIVITY_MAX_LINES:])
436
+
437
+
438
+ def _append_activity(lines: list[str], text: str) -> list[str]:
439
+ text = text.strip()
440
+ if text:
441
+ lines.append(text)
442
+ return lines
443
+
444
+
445
+ def _append_chat_segment(
446
+ completed_segments: list[str],
447
+ streaming_text: str,
448
+ segment: str,
449
+ ) -> tuple[list[str], str]:
450
+ """Append a new visible chat segment (tool line or prose), preserving prior segments."""
451
+ segment = segment.strip()
452
+ if not segment:
453
+ return completed_segments, streaming_text
454
+ if streaming_text.strip():
455
+ completed_segments = completed_segments + [streaming_text.strip()]
456
+ streaming_text = ""
457
+ if not completed_segments or completed_segments[-1] != segment:
458
+ completed_segments = completed_segments + [segment]
459
+ return completed_segments, streaming_text
460
+
461
+
462
def _apply_event(
    event: PiStreamEvent,
    *,
    history: list[dict[str, Any]],
    activity: list[str],
    thinking: str,
    tool_output: str,
    tool_heading: str,
    completed_segments: list[str],
    streaming_text: str,
) -> tuple[list[dict[str, Any]], list[str], str, str, str, list[str], str]:
    """
    Fold one Pi stream event into the chat UI state.

    The last entry of ``history`` is rewritten in place for text, tool-start,
    error and done events, so callers must have appended the assistant bubble
    before streaming. Event kinds that match no branch leave the state as-is.

    Args:
        event: Stream event; ``kind`` selects the branch below.
        history: Chat messages; ``history[-1]`` is the bubble being updated.
        activity: Activity log lines (appended to in place).
        thinking: Accumulated thinking trace.
        tool_output: Body currently shown in the tool panel.
        tool_heading: Heading currently shown in the tool panel.
        completed_segments: Finished chat segments of the current turn.
        streaming_text: Assistant prose still being streamed.

    Returns:
        ``(history, activity, thinking, tool_output, tool_heading,
        completed_segments, streaming_text)`` after applying the event.
    """
    if event.kind == "text_snapshot":
        # A snapshot whose first line looks like "**label:** …" is stored as
        # its own finished segment; anything else replaces the streaming text.
        if event.text.strip().startswith("**") and ":" in event.text.split("\n", 1)[0]:
            completed_segments, streaming_text = _append_chat_segment(
                completed_segments, streaming_text, event.text
            )
        else:
            streaming_text = event.text
        history[-1]["content"] = _assistant_display_text(
            completed_segments, streaming_text
        )

    elif event.kind == "text_delta":
        streaming_text += event.text
        history[-1]["content"] = _assistant_display_text(
            completed_segments, streaming_text
        )

    elif event.kind == "thinking_snapshot":
        if SHOW_THINKING:
            thinking = event.text

    elif event.kind == "thinking_delta":
        if SHOW_THINKING:
            thinking += event.text

    elif event.kind == "status":
        activity = _append_activity(activity, event.text)

    elif event.kind == "turn_end":
        activity = _append_activity(activity, event.text)

    elif event.kind == "tool_start":
        # Close off any prose streamed before the tool call.
        if streaming_text.strip():
            completed_segments.append(streaming_text.strip())
            streaming_text = ""
        label = event.tool_name or "tool"
        detail = event.text or label
        tool_line = f"**{label}:** {detail}" if detail != label else f"**{label}**"
        completed_segments, streaming_text = _append_chat_segment(
            completed_segments, streaming_text, tool_line
        )
        history[-1]["content"] = _assistant_display_text(
            completed_segments, streaming_text
        )
        activity = _append_activity(activity, f"**Tool start:** `{label}` — {detail}")
        # Heading opens a code fence; the panel formatter closes it.
        tool_heading = f"### {label}\n{detail}\n\n```\n"
        tool_output = ""

    elif event.kind in {"tool_update", "tool_end"}:
        # Only the tool *panel* depends on SHOW_TOOL_OUTPUT. The completion
        # line below is always logged, matching the unconditional
        # "Tool start" entry, so failures stay visible in the activity log.
        if SHOW_TOOL_OUTPUT and event.tool_output is not None:
            tool_output = _truncate(event.tool_output)
        if event.kind == "tool_end":
            status = "failed" if event.is_error else "completed"
            activity = _append_activity(
                activity,
                f"**Tool {status}:** `{event.tool_name or 'tool'}`",
            )

    elif event.kind == "error":
        activity = _append_activity(activity, f"**Error:** {event.text}")
        history[-1]["content"] = _assistant_display_text(
            completed_segments,
            streaming_text,
        )
        history[-1]["content"] += f"\n\n**Error:** {event.text}"

    elif event.kind == "done":
        if streaming_text.strip():
            completed_segments.append(streaming_text)
            streaming_text = ""
        # The done text starting with "agent aborted" marks an aborted run.
        aborted = event.text.strip().lower().startswith("agent aborted")
        history, completed_segments, streaming_text = _append_agent_finish_notice(
            history,
            completed_segments,
            streaming_text,
            aborted=aborted,
        )
        activity = _append_activity(activity, event.text)

    return (
        history,
        activity,
        thinking,
        tool_output,
        tool_heading,
        completed_segments,
        streaming_text,
    )
561
+
562
+
563
+ def _format_tool_panel(heading: str, body: str) -> str:
564
+ if not heading and not body:
565
+ return ""
566
+ if heading.endswith("```\n") and body:
567
+ return f"{heading}{body}\n```"
568
+ if heading and not body:
569
+ return heading.rstrip("`") + "…`\n```" if heading.endswith("```\n") else heading
570
+ return heading + body
571
+
572
+
573
def _pi_agent_model_label(client: PiRpcClient | None) -> str:
    """
    Describe the Pi orchestration model for the status panel.

    With a running client the label comes from the client's state. Without
    one, or when reading the state raises ``PiRpcError``, the configured
    default provider/model is shown with a "default until backend applied"
    suffix.
    """
    if client is not None and client.running:
        try:
            state = client.get_state()
            model_info = state.get("model") or {}
            active_provider = str(
                model_info.get("provider") or state.get("provider") or ""
            )
            active_model = str(model_info.get("id") or model_info.get("name") or "")
            if not (active_provider and active_model):
                return active_model or active_provider or "—"
            return f"{provider_label(active_provider)} / {active_model}"
        except PiRpcError:
            # Fall through to the configured defaults below.
            pass
    default_provider = normalize_provider(get_default_provider())
    default_model = resolved_default_model(default_provider)
    return (
        f"{provider_label(default_provider)} / {default_model} "
        "(default until backend applied)"
    )
589
+
590
+
591
def _agent_status_markdown(client: PiRpcClient | None = None) -> str:
    """
    Build the status block shown at the top of the UI.

    Lines, in order: connection status, redaction backend URL, Pi agent
    model, a blank line, then either the credential status (client running)
    or a setup hint (no running client).
    """
    from redaction_prompt import doc_redaction_gradio_url

    backend_line = f"**Redaction backend:** `{doc_redaction_gradio_url()}`"
    model_line = f"**Pi agent model:** `{_pi_agent_model_label(client)}`"
    if client is not None and client.running:
        status_line = "**Status:** Pi agent connected"
        footer = credential_status_markdown()
    else:
        status_line = "**Status:** Ready"
        footer = (
            "_Set `DOC_REDACTION_GRADIO_URL` in `config/pi_agent.env` if the doc_redaction "
            "app is not at the URL above. Apply **Agent backend** to start Pi._"
        )
    return " \n".join([status_line, backend_line, model_line, "", footer])
611
+
612
+
613
def _session_summary(client: PiRpcClient) -> str:
    """
    Return the status block followed by the Pi session details.

    The details are the streaming flag, the compacting flag and the session
    log file taken from the client's state. When the state cannot be read,
    the status block is followed by the error instead.
    """
    try:
        state = client.get_state()
    except PiRpcError as exc:
        return f"{_agent_status_markdown(client)} \n\n_Could not read Pi state: {exc}_"
    log_file = state.get("sessionFile") or "—"
    is_streaming = state.get("isStreaming")
    is_compacting = state.get("isCompacting")
    status_block = _agent_status_markdown(client)
    flags_line = f"**Streaming:** `{is_streaming}` · **Compacting:** `{is_compacting}` \n"
    return f"{status_block} \n\n" + flags_line + f"**Session log:** `{log_file}`"
626
+
627
+
628
def _backend_model_choices_update(provider: str):
    """Return a Gradio update with the model choices and default for ``provider``."""
    provider_key = normalize_provider(provider)
    available = models_for_provider(provider_key)
    return gr.update(
        choices=available,
        value=default_model_for_provider(provider_key),
    )
632
+
633
+
634
def apply_backend(
    provider: str,
    model_id: str,
    gemini_api_key: str,
    hf_token: str,
    aws_region: str,
    aws_access_key_id: str,
    aws_secret_access_key: str,
    aws_session_token: str,
    client: PiRpcClient | None,
    session_hash: str,
):
    """Apply provider/model/credentials from the UI and (re)start the Pi client.

    Returns a 6-tuple: the new client (or ``None`` on failure), a markdown
    summary, and four ``gr.update(value="")`` items that blank the textboxes
    wired after them in the click handler's outputs.
    """
    provider_key = normalize_provider(provider)
    chosen_model = (model_id or default_model_for_provider(provider_key)).strip()
    if chosen_model not in models_for_provider(provider_key):
        # Unknown model for this provider: fall back to its default.
        chosen_model = default_model_for_provider(provider_key)

    apply_session_credentials(
        gemini_api_key=gemini_api_key or None,
        hf_token=hf_token or None,
        aws_region=aws_region or None,
        aws_access_key_id=aws_access_key_id or None,
        aws_secret_access_key=aws_secret_access_key or None,
        aws_session_token=aws_session_token or None,
    )
    if hf_token and hf_token.strip():
        os.environ["_HF_TOKEN_FROM_UI"] = "1"
    write_runtime_config(default_provider=provider_key, default_model=chosen_model)

    # Any previously running client is shut down before a new one is created.
    previous = _coerce_client(client)
    if previous is not None:
        previous.close()

    def _result(rpc_client, status_md):
        """Pack the client and summary with the four blanking updates."""
        return (
            rpc_client,
            status_md,
            gr.update(value=""),
            gr.update(value=""),
            gr.update(value=""),
            gr.update(value=""),
        )

    key_error = _gemini_key_error()
    if key_error:
        return _result(None, key_error)

    rpc = default_client(session_hash or None)
    try:
        rpc.start()
        rpc.set_model(provider_key, chosen_model)
        rpc.new_session()
        summary = (
            f"**Backend applied:** `{provider_label(provider_key)}` / `{chosen_model}` \n\n"
            f"{_session_summary(rpc)}"
        )
    except (PiRpcError, FileNotFoundError, OSError) as exc:
        rpc.close()
        rpc = None
        summary = f"**Backend error:** {exc} \n\n{credential_status_markdown()}"

    return _result(rpc, summary)
700
+
701
+
702
def _init_session_ui(
    request: gr.Request,
) -> tuple[str, Any, str, list[str] | None, str]:
    """Initialise the session workspace on page load and log the access.

    Returns the session hash, file-explorer value, workspace status text,
    the current final output files, and the S3 prefix, in that order.
    """
    session_hash, explorer, status, s3_prefix = init_session_workspace(request)
    log_platform_access(session_hash, HOST_NAME)
    final_files = collect_final_output_files(session_hash)
    return session_hash, explorer, status, final_files, s3_prefix
714
+
715
+
716
def _chat_yield(
    history: list[dict[str, Any]],
    client: PiRpcClient,
    activity: list[str],
    thinking: str,
    tool_heading: str,
    tool_output: str,
    *,
    msg: str = "",
    send_enabled: bool = True,
    abort_enabled: bool = False,
    redact_enabled: bool = True,
    session_info: str | None = None,
    session_hash: str = "",
    refresh_final_files: bool = False,
    agent_finish_signal: str = AGENT_FINISH_SIGNAL_NONE,
):
    """Assemble one 13-item tuple matching the ``chat_outputs`` component list.

    The download components are refreshed only when ``refresh_final_files``
    is set; otherwise they receive a no-op ``gr.update()``. When
    ``session_info`` is ``None`` the summary is computed from ``client``.
    """
    final_files, session_log = (
        (
            collect_final_output_files(session_hash),
            collect_session_log_download(client),
        )
        if refresh_final_files
        else (gr.update(), gr.update())
    )

    chat_view = _clone_history(history)
    activity_md = _format_activity(activity)
    tool_md = _format_tool_panel(tool_heading, tool_output)
    thinking_text = _truncate_thinking(thinking)
    summary_md = _session_summary(client) if session_info is None else session_info

    return (
        chat_view,
        client,
        msg,
        activity_md,
        tool_md,
        thinking_text,
        summary_md,
        gr.update(interactive=send_enabled),
        gr.update(interactive=abort_enabled),
        gr.update(interactive=redact_enabled),
        final_files,
        session_log,
        agent_finish_signal,
    )
757
+
758
+
759
def _run_pi_chat(
    message: str,
    history: list[dict[str, Any]] | None,
    client: PiRpcClient | None,
    *,
    chat_user_message: str | None = None,
    session_hash: str = "",
    initial_session_info: str | None = None,
    s3_output_folder: str = "",
    save_outputs_to_s3: bool = False,
    document_name: str = "",
    base_file: str | None = None,
    ocr_method: str = "",
    pii_method: str = "",
    total_page_count: int = 0,
    vlm_model_name: str | None = None,
    redact_file: str | None = None,
):
    """Send ``message`` to Pi and yield UI output tuples as events stream in.

    This is a generator. Each yielded value is a 13-item tuple in the order
    produced by ``_chat_yield``. While a turn is in flight the send and redact
    buttons are disabled and abort is enabled; every terminal yield re-enables
    them, refreshes the download components and carries a finish signal.

    Args:
        message: Prompt text sent to Pi (after workspace prefixes are added).
        history: Existing chat history; copied before being extended.
        client: Current Pi client, or ``None``; passed through
            ``_ensure_client`` for non-empty messages.
        chat_user_message: Text shown in the chat as the user turn instead of
            ``message`` when provided.
        session_hash: Session identifier used for workspace lookups.
        initial_session_info: Optional text prepended to the session summary.
        s3_output_folder, save_outputs_to_s3, document_name, base_file,
        ocr_method, pii_method, total_page_count, vlm_model_name: Forwarded
            unchanged to ``_after_pi_task`` when the task completes.
        redact_file: When set and ``message`` is empty, a hint is shown in
            the activity panel.

    Raises:
        Exception: Any non-``PiRpcError`` exception raised while streaming is
            re-raised unless ``client.abort_requested`` is set.
    """
    # Empty prompt: emit a single idle-state update and stop.
    if not message or not message.strip():
        client = client if client and client.running else None
        hint_activity = [EMPTY_SEND_WITH_FILE_HINT] if redact_file else []
        if client:
            yield _chat_yield(
                history or [],
                client,
                hint_activity,
                "",
                "",
                "",
                session_hash=session_hash,
            )
        else:
            # No running client: build the tuple by hand because
            # _chat_yield would call _session_summary(client).
            activity_text = (
                _format_activity(hint_activity)
                if hint_activity
                else "_No activity yet._"
            )
            yield (
                history or [],
                None,
                "",
                activity_text,
                "",
                "",
                "_Ready._",
                gr.update(interactive=True),
                gr.update(interactive=False),
                gr.update(interactive=True),
                gr.update(),
                gr.update(),
                AGENT_FINISH_SIGNAL_NONE,
            )
        return

    history = list(history or [])
    client = _ensure_client(client, session_hash)
    activity: list[str] = []
    thinking = ""
    tool_output = ""
    tool_heading = ""
    completed_segments: list[str] = []
    streaming_text = ""
    task_started_at = time.time()
    # Token usage before this turn; compared against in _complete_pi_task.
    usage_baseline = resolve_session_token_usage(client)

    def _complete_pi_task() -> None:
        """Run the post-task hook with this turn's token usage and task metadata."""
        usage = usage_for_completed_turn(client, usage_baseline)
        _after_pi_task(
            session_hash=session_hash,
            client=client,
            s3_output_folder=s3_output_folder,
            save_outputs_to_s3=save_outputs_to_s3,
            document_name=document_name,
            started_at=task_started_at,
            base_file=base_file,
            ocr_method=ocr_method,
            pii_method=pii_method,
            total_page_count=total_page_count,
            vlm_model_name=vlm_model_name,
            llm_input_tokens=usage.llm_input_tokens,
            llm_output_tokens=usage.llm_output_tokens,
        )

    # Show the user turn plus an empty assistant placeholder that later
    # updates fill in (history[-1] is overwritten on errors below).
    history.append({"role": "user", "content": chat_user_message or message.strip()})
    history.append({"role": "assistant", "content": ""})
    activity = _append_activity(activity, "Prompt sent.")
    if initial_session_info:
        activity = _append_activity(
            activity,
            f"Using workspace `{session_workspace_dir(session_hash).as_posix()}/`.",
        )
    session_info = _session_summary(client)
    if initial_session_info:
        session_info = f"{initial_session_info}\n\n{session_info}"

    # First update: lock send/redact, enable abort.
    yield _chat_yield(
        history,
        client,
        activity,
        thinking,
        tool_heading,
        tool_output,
        send_enabled=False,
        abort_enabled=True,
        redact_enabled=False,
        session_info=session_info,
        session_hash=session_hash,
    )

    from pi_workspace_skills import workspace_boundary_prefix

    # The prompt sent to Pi is the user message with two workspace prefixes.
    pi_message = (
        workspace_boundary_prefix(session_hash)
        + workspace_context_prefix(session_hash)
        + message.strip()
    )
    prompt_to_send = pi_message
    quota_failures = 0
    finish_aborted = False

    try:
        # One iteration per attempt; loops again only after a rate-limit error.
        while True:
            turn_error: str | None = None
            try:
                for event in client.prompt_events(prompt_to_send):
                    if event.kind == "done":
                        # Remember whether the final event reports an abort.
                        finish_aborted = (
                            event.text.strip().lower().startswith("agent aborted")
                        )
                    (
                        history,
                        activity,
                        thinking,
                        tool_output,
                        tool_heading,
                        completed_segments,
                        streaming_text,
                    ) = _apply_event(
                        event,
                        history=history,
                        activity=activity,
                        thinking=thinking,
                        tool_output=tool_output,
                        tool_heading=tool_heading,
                        completed_segments=completed_segments,
                        streaming_text=streaming_text,
                    )
                    yield _chat_yield(
                        history,
                        client,
                        activity,
                        thinking,
                        tool_heading,
                        tool_output,
                        send_enabled=False,
                        abort_enabled=True,
                        redact_enabled=False,
                        session_info=session_info,
                        session_hash=session_hash,
                    )
                # Stream finished without raising: check the last assistant
                # message for an error recorded in the transcript.
                turn_error = last_assistant_turn_error(client.get_messages())
            except PiRpcError as exc:
                # Only rate-limit errors are retried here; anything else
                # goes to the outer PiRpcError handler.
                if not is_rate_limit_error(str(exc)):
                    raise
                turn_error = str(exc)

            if turn_error and is_rate_limit_error(turn_error):
                quota_failures += 1
                if quota_failures >= QUOTA_RETRY_ATTEMPTS:
                    # Retries exhausted: report the error and finish the task.
                    err_summary = turn_error[:500].replace("\n", " ")
                    history[-1]["content"] = (
                        f"**Gemini rate limit / quota:** stopped after "
                        f"{QUOTA_RETRY_ATTEMPTS} consecutive attempts.\n\n"
                        f"{err_summary}"
                    )
                    activity = _append_activity(
                        activity,
                        f"**Quota retries exhausted** ({QUOTA_RETRY_ATTEMPTS} attempts).",
                    )
                    history, completed_segments, streaming_text = (
                        _append_agent_finish_notice(
                            history,
                            completed_segments,
                            streaming_text,
                            error=True,
                        )
                    )
                    _complete_pi_task()
                    finish_signal = _notify_agent_finished(error=True)
                    yield _chat_yield(
                        history,
                        client,
                        activity,
                        thinking,
                        tool_heading,
                        tool_output,
                        send_enabled=True,
                        abort_enabled=False,
                        redact_enabled=True,
                        session_info=_session_summary(client),
                        session_hash=session_hash,
                        refresh_final_files=True,
                        agent_finish_signal=finish_signal,
                    )
                    return

                activity = _append_activity(
                    activity,
                    (
                        f"Gemini rate limit — waiting {QUOTA_RETRY_DELAY_S}s before "
                        f"retry {quota_failures}/{QUOTA_RETRY_ATTEMPTS}…"
                    ),
                )
                yield _chat_yield(
                    history,
                    client,
                    activity,
                    thinking,
                    tool_heading,
                    tool_output,
                    send_enabled=False,
                    abort_enabled=True,
                    redact_enabled=False,
                    session_info=session_info,
                    session_hash=session_hash,
                )
                # Blocking wait before the retry.
                time.sleep(QUOTA_RETRY_DELAY_S)
                # Retries send the continue prompt rather than the original
                # message, into a fresh assistant placeholder.
                prompt_to_send = QUOTA_CONTINUE_PROMPT
                history.append({"role": "assistant", "content": ""})
                completed_segments = []
                streaming_text = ""
                continue

            # No rate-limit error: the turn is complete.
            break
    except PiRpcError as exc:
        # Non-rate-limit Pi error: surface it in chat and finish with error.
        history[-1]["content"] = f"**Pi error:** {exc}"
        activity = _append_activity(activity, f"**Pi error:** {exc}")
        history, completed_segments, streaming_text = _append_agent_finish_notice(
            history,
            completed_segments,
            streaming_text,
            error=True,
        )
        _complete_pi_task()
        finish_signal = _notify_agent_finished(error=True)
        yield _chat_yield(
            history,
            client,
            activity,
            thinking,
            tool_heading,
            tool_output,
            send_enabled=True,
            abort_enabled=False,
            redact_enabled=True,
            session_info=_session_summary(client),
            session_hash=session_hash,
            refresh_final_files=True,
            agent_finish_signal=finish_signal,
        )
        return
    except Exception:
        # Any other exception is treated as a user abort only when the
        # client says one was requested; otherwise it propagates.
        if client.abort_requested:
            activity = _append_activity(activity, "**Aborted.**")
            history, completed_segments, streaming_text = _append_agent_finish_notice(
                history,
                completed_segments,
                streaming_text,
                aborted=True,
            )
            _complete_pi_task()
            finish_signal = _notify_agent_finished(aborted=True)
            yield _chat_yield(
                history,
                client,
                activity,
                thinking,
                tool_heading,
                tool_output,
                send_enabled=True,
                abort_enabled=False,
                redact_enabled=True,
                session_info=_session_summary(client),
                session_hash=session_hash,
                refresh_final_files=True,
                agent_finish_signal=finish_signal,
            )
            return
        raise

    # Normal completion path.
    _finalize_assistant_chat(
        client,
        history,
        completed_segments=completed_segments,
        streaming_text=streaming_text,
        activity=activity,
    )

    _complete_pi_task()
    finish_signal = _notify_agent_finished(aborted=finish_aborted)
    yield _chat_yield(
        history,
        client,
        activity,
        thinking,
        tool_heading,
        tool_output,
        send_enabled=True,
        abort_enabled=False,
        redact_enabled=True,
        session_info=_session_summary(client),
        session_hash=session_hash,
        refresh_final_files=True,
        agent_finish_signal=finish_signal,
    )
1074
+
1075
+
1076
def chat_respond(
    message: str,
    history: list[dict[str, Any]] | None,
    client: PiRpcClient | None,
    session_hash: str,
    s3_output_folder: str,
    save_outputs_to_s3: bool,
    redact_file: str | None,
):
    """Gradio handler for follow-up chat: relay every update from ``_run_pi_chat``."""
    updates = _run_pi_chat(
        message,
        history,
        client,
        session_hash=session_hash,
        s3_output_folder=s3_output_folder,
        save_outputs_to_s3=save_outputs_to_s3,
        redact_file=redact_file,
    )
    yield from updates
1094
+
1095
+
1096
+ def _redaction_page_count(upload_file: str | None, page_range: str) -> int:
1097
+ if not upload_file or not str(upload_file).lower().endswith(".pdf"):
1098
+ return 0
1099
+ try:
1100
+ total = pdf_page_count(upload_file)
1101
+ return pages_to_process_count(page_range or "all", total)
1102
+ except (ValueError, OSError):
1103
+ return 0
1104
+
1105
+
1106
def prepare_redaction_session_ui(
    session_hash: str,
    request: gr.Request,
) -> tuple[str, str]:
    """Prepare the session workspace ahead of a redaction run.

    Returns the effective session hash and the workspace status text; the
    workspace path returned by ``prepare_session_workspace`` is discarded.
    """
    effective_hash, _unused_path, status_md = prepare_session_workspace(
        session_hash, request
    )
    return effective_hash, status_md
1113
+
1114
+
1115
def submit_redaction_task(
    upload_file: str | None,
    user_instructions: str,
    page_range: str,
    ocr_method: str,
    pii_method: str,
    encourage_vlm_faces: bool,
    encourage_vlm_signatures: bool,
    history: list[dict[str, Any]] | None,
    client: PiRpcClient | None,
    session_hash: str,
    s3_output_folder: str,
    save_outputs_to_s3: bool,
    request: gr.Request,
):
    """Gradio handler (generator) that builds a redaction prompt and runs it via Pi.

    If preparing the task raises ``ValueError``, ``FileNotFoundError`` or
    ``OSError``, a single error update is yielded and the generator stops.
    Otherwise every update from ``_run_pi_chat`` is relayed.
    """
    session_hash, _workspace_path, workspace_status = prepare_session_workspace(
        session_hash, request
    )

    # On a Hugging Face Space the UI selections are ignored in favour of defaults.
    if IS_HF_SPACE:
        settings = RedactionTaskSettings.hf_space_defaults()
    else:
        settings = RedactionTaskSettings.from_ui(
            ocr_method,
            pii_method,
            encourage_vlm_faces,
            encourage_vlm_signatures,
        )

    effective_range = page_range or "all"
    try:
        _file_name, prompt, renamed_from = prepare_redaction_task(
            upload_file,
            user_instructions,
            page_range=effective_range,
            settings=settings,
            workspace_dir=_workspace_path,
        )
    except (ValueError, FileNotFoundError, OSError) as exc:
        history = list(history or [])
        history.append(
            {"role": "user", "content": f"_Redaction task not started: {exc}_"}
        )
        if client and client.running:
            client = _ensure_client(client, session_hash)
        yield (
            _clone_history(history),
            client,
            "",
            _format_activity([f"**Redaction task error:** {exc}"]),
            "",
            "",
            (
                _session_summary(client)
                if client and client.running
                else _agent_status_markdown(client)
            ),
            gr.update(interactive=True),
            gr.update(interactive=False),
            gr.update(interactive=True),
            gr.update(),
            gr.update(),
            AGENT_FINISH_SIGNAL_NONE,
        )
        return

    page_count = _redaction_page_count(upload_file, effective_range)

    # Human-readable summary shown as the user turn instead of the full prompt.
    faces_flag = "on" if settings.encourage_vlm_faces else "off"
    signatures_flag = "on" if settings.encourage_vlm_signatures else "off"
    chat_summary = (
        f"**Redaction task:** `{_file_name}` \n"
        f"**Page range:** `{effective_range}` \n"
        f"**OCR / text extraction:** `{settings.ocr_method}` \n"
        f"**PII model:** `{settings.pii_method}` \n"
        f"**VLM faces guidance:** {faces_flag} \n"
        f"**VLM signature guidance:** {signatures_flag}\n\n"
        f"{user_instructions.strip()}"
    )
    if renamed_from:
        chat_summary = (
            f"_Your uploaded file `{renamed_from}` was saved as `{_file_name}` for this "
            f"task because the original name contained characters that are unsafe for "
            f"file paths._\n\n{chat_summary}"
        )

    yield from _run_pi_chat(
        prompt,
        history,
        client,
        chat_user_message=chat_summary,
        session_hash=session_hash,
        initial_session_info=workspace_status,
        s3_output_folder=s3_output_folder,
        save_outputs_to_s3=save_outputs_to_s3,
        document_name=_file_name,
        base_file=upload_file,
        ocr_method=settings.ocr_method,
        pii_method=settings.pii_method,
        total_page_count=page_count,
        vlm_model_name=os.environ.get("PI_VLM_MODEL"),
    )
1214
+
1215
+
1216
def abort_agent(client: PiRpcClient | None):
    """Ask a running Pi client to abort, then return updates that reset the buttons.

    The three updates re-enable send, disable abort and re-enable the redact
    button, matching the outputs wired to the abort button.
    """
    rpc = _coerce_client(client)
    can_abort = rpc is not None and rpc.running
    if can_abort:
        try:
            rpc.abort()
        except (PiRpcError, OSError, ValueError):
            # Best effort: the buttons are reset even if the abort call fails.
            pass
    send_update = gr.update(interactive=True)
    abort_update = gr.update(interactive=False)
    redact_update = gr.update(interactive=True)
    return send_update, abort_update, redact_update
1228
+
1229
+
1230
def new_chat(
    _history,
    client: PiRpcClient | None,
    session_hash: str,
):
    """Start a fresh Pi session and return a cleared chat output tuple.

    An existing client is asked for a new session; if that raises
    ``PiRpcError`` it is closed and replaced. With no client, one is created
    and started. The previous chat history argument is ignored.
    """
    needs_fresh_client = client is None
    if not needs_fresh_client:
        try:
            client.new_session()
        except PiRpcError:
            client.close()
            needs_fresh_client = True
    if needs_fresh_client:
        client = default_client(session_hash or None)
        client.start()
    return _chat_yield(
        [],
        client,
        ["New session."],
        "",
        "",
        "",
        session_hash=session_hash,
        refresh_final_files=True,
    )
1255
+
1256
+
1257
def _startup_session_info() -> str:
    """Return the initial session-info markdown shown before any backend is applied.

    On a Hugging Face Space, setup instructions are placed ahead of the
    agent status block.
    """
    status_block = _agent_status_markdown(None)
    if not IS_HF_SPACE:
        return status_block
    return (
        "**Hugging Face Space profile** — Gemini orchestration with remote Document Redaction App "
        "backend. \n\n"
        "1. Paste your **Gemini API key** (and optional **HF token** for a private "
        "redaction Space). \n"
        "2. Click **Apply backend**. \n\n"
        f"{status_block}"
    )
1268
+
1269
+
1270
+ def build_ui():
1271
+ hf_redaction_blurb = (
1272
+ "Upload a document and add bullet-point requirements. Redaction runs on a **remote** "
1273
+ "Redaction App Hugging Face Space. \n"
1274
+ "When ready, use **Start redaction task** under the chat panel to the right."
1275
+ if IS_HF_SPACE
1276
+ else (
1277
+ "Upload a PDF (or other supported document). Add bullet-point instructions for redaction below. \n"
1278
+ "When ready, use **Start redaction task** under the chat panel to the right."
1279
+ )
1280
+ )
1281
+ backend_blurb = (
1282
+ "Gemini powers the Pi agent on this Space. Paste your **Gemini API key** "
1283
+ "(session-only, not stored on disk). Optionally override the **HF token** used "
1284
+ "to reach the private redaction backend."
1285
+ if IS_HF_SPACE
1286
+ else (
1287
+ "Choose which LLM powers the Pi agent (chat and redaction orchestration). "
1288
+ "Credentials from the UI apply **for this container session only**; "
1289
+ "defaults can be set via `config/pi_agent.env` or compose environment."
1290
+ )
1291
+ )
1292
+ hf_locked_settings_md = (
1293
+ f"**Locked defaults (HF Space):** \n"
1294
+ f"- Text extraction: `{DEFAULT_OCR_METHOD}` \n"
1295
+ f"- PII model: `{DEFAULT_PII_METHOD}` \n"
1296
+ f"- Face/signature VLM: unavailable"
1297
+ if IS_HF_SPACE
1298
+ else ""
1299
+ )
1300
+
1301
+ with gr.Blocks(
1302
+ title=PI_UI_TITLE,
1303
+ fill_height=True,
1304
+ ) as demo:
1305
+ gr.Markdown(PI_INTRO_TEXT)
1306
+ client_state = gr.State(None)
1307
+ session_hash_state = gr.State("")
1308
+ s3_output_folder_state = gr.State("")
1309
+ save_outputs_to_s3_state = gr.State(SAVE_OUTPUTS_TO_S3)
1310
+
1311
+ with gr.Accordion("View session info", open=False):
1312
+ session_info = gr.Markdown(_startup_session_info())
1313
+
1314
+ with gr.Row(equal_height=False):
1315
+ with gr.Column(scale=2):
1316
+
1317
+ with gr.Accordion("Redaction task", open=True):
1318
+ gr.Markdown(hf_redaction_blurb)
1319
+
1320
+ pi_example_rows, pi_example_labels = example_rows()
1321
+
1322
+ redact_file = gr.File(
1323
+ label="Document to redact",
1324
+ file_types=[
1325
+ ".pdf",
1326
+ ".png",
1327
+ ".jpg",
1328
+ ".jpeg",
1329
+ ".docx",
1330
+ ".csv",
1331
+ ".xlsx",
1332
+ ],
1333
+ type="filepath",
1334
+ render=False,
1335
+ )
1336
+ redact_instructions = gr.Textbox(
1337
+ label="Redaction requirements",
1338
+ placeholder=(
1339
+ "- Redact all personal names\n"
1340
+ "- Remove organisation addresses\n"
1341
+ "- Keep publication titles visible"
1342
+ ),
1343
+ lines=8,
1344
+ render=False,
1345
+ )
1346
+ page_range = gr.Textbox(
1347
+ label="Page range",
1348
+ value="all",
1349
+ placeholder="all or e.g. 1-56",
1350
+ render=False,
1351
+ )
1352
+ if IS_HF_SPACE:
1353
+ ocr_method = gr.State(DEFAULT_OCR_METHOD)
1354
+ pii_method = gr.State(DEFAULT_PII_METHOD)
1355
+ encourage_vlm_faces = gr.State(False)
1356
+ encourage_vlm_signatures = gr.State(False)
1357
+ settings_accordion = None
1358
+ else:
1359
+ settings_accordion = gr.Accordion(
1360
+ "Redaction settings (prompt defaults)",
1361
+ open=False,
1362
+ render=False,
1363
+ )
1364
+ with settings_accordion:
1365
+ gr.Markdown(
1366
+ "These values are injected into the task prompt under "
1367
+ "**Technical constraints** — they suggest defaults to Pi for "
1368
+ "`/doc_redact`, not hard-coded app settings."
1369
+ )
1370
+ ocr_method = gr.Dropdown(
1371
+ label="Default text extraction method",
1372
+ choices=list(OCR_METHOD_CHOICES),
1373
+ value=DEFAULT_OCR_METHOD,
1374
+ allow_custom_value=True,
1375
+ )
1376
+ pii_method = gr.Dropdown(
1377
+ label="Default PII identification model",
1378
+ choices=list(PII_METHOD_CHOICES),
1379
+ value=DEFAULT_PII_METHOD,
1380
+ allow_custom_value=True,
1381
+ )
1382
+ encourage_vlm_faces = gr.Checkbox(
1383
+ label="Encourage CUSTOM_VLM_FACES when user asks to redact faces",
1384
+ value=True,
1385
+ )
1386
+ encourage_vlm_signatures = gr.Checkbox(
1387
+ label=(
1388
+ "Encourage CUSTOM_VLM_SIGNATURE when user asks "
1389
+ "to redact signatures"
1390
+ ),
1391
+ value=True,
1392
+ )
1393
+
1394
+ if pi_example_rows:
1395
+ gr.Markdown(
1396
+ "### Try an example\n"
1397
+ "Click a row to load the sample PDF and redaction instructions, "
1398
+ "then **Start redaction task** under the chat panel to the right."
1399
+ )
1400
+ gr.Examples(
1401
+ examples=pi_example_rows,
1402
+ inputs=[
1403
+ redact_file,
1404
+ redact_instructions,
1405
+ page_range,
1406
+ ocr_method,
1407
+ pii_method,
1408
+ encourage_vlm_faces,
1409
+ encourage_vlm_signatures,
1410
+ ],
1411
+ example_labels=pi_example_labels,
1412
+ examples_per_page=2,
1413
+ cache_examples=False,
1414
+ )
1415
+ else:
1416
+ gr.Markdown(examples_status_markdown())
1417
+
1418
+ redact_file.render()
1419
+ redact_instructions.render()
1420
+ page_range.render()
1421
+ if IS_HF_SPACE:
1422
+ gr.Markdown(hf_locked_settings_md)
1423
+ elif settings_accordion is not None:
1424
+ settings_accordion.render()
1425
+
1426
+ with gr.Accordion("Agent backend/API keys", open=IS_HF_SPACE):
1427
+ gr.Markdown(backend_blurb)
1428
+ backend_provider = gr.Radio(
1429
+ label="Provider",
1430
+ choices=[
1431
+ (provider_label(key), key) for key in provider_choices()
1432
+ ],
1433
+ value=get_default_provider(),
1434
+ )
1435
+ backend_model = gr.Dropdown(
1436
+ label="Model",
1437
+ choices=models_for_provider(get_default_provider()),
1438
+ value=default_model_for_provider(get_default_provider()),
1439
+ allow_custom_value=True,
1440
+ )
1441
+ gemini_api_key = gr.Textbox(
1442
+ label=(
1443
+ "Gemini API key (required on HF Space)"
1444
+ if IS_HF_SPACE
1445
+ else "Gemini API key (session override)"
1446
+ ),
1447
+ type="password",
1448
+ placeholder=(
1449
+ "Required — get a key from Google AI Studio"
1450
+ if IS_HF_SPACE
1451
+ else "Uses GEMINI_API_KEY / GOOGLE_API_KEY from env if empty"
1452
+ ),
1453
+ )
1454
+ hf_token = gr.Textbox(
1455
+ label="HF token for redaction Space (session override)",
1456
+ type="password",
1457
+ placeholder="Uses HF_TOKEN Space secret if empty",
1458
+ visible=IS_HF_SPACE,
1459
+ )
1460
+ with gr.Accordion("AWS credentials (optional)", open=False):
1461
+ aws_region = gr.Textbox(
1462
+ label="AWS region (session override)",
1463
+ placeholder="e.g. eu-west-2",
1464
+ visible=not IS_HF_SPACE,
1465
+ )
1466
+ aws_access_key_id = gr.Textbox(
1467
+ label="AWS access key ID (session override)",
1468
+ type="password",
1469
+ visible=not IS_HF_SPACE,
1470
+ )
1471
+ aws_secret_access_key = gr.Textbox(
1472
+ label="AWS secret access key (session override)",
1473
+ type="password",
1474
+ visible=not IS_HF_SPACE,
1475
+ )
1476
+ aws_session_token = gr.Textbox(
1477
+ label="AWS session token (optional)",
1478
+ type="password",
1479
+ visible=False, # not IS_HF_SPACE,
1480
+ )
1481
+ apply_backend_btn = gr.Button(
1482
+ "Apply backend",
1483
+ variant="primary",
1484
+ )
1485
+
1486
+ with gr.Column(scale=3):
1487
+ chatbot = gr.Chatbot(label="Task progress", height=480)
1488
+ with gr.Row():
1489
+ start_redact_btn = gr.Button(
1490
+ "Start redaction task",
1491
+ variant="primary",
1492
+ )
1493
+ abort_btn = gr.Button("Abort", variant="stop", interactive=False)
1494
+ clear = gr.Button("New session")
1495
+ with gr.Accordion("Follow-up chat (optional)", open=False):
1496
+ msg = gr.Textbox(
1497
+ label="Message",
1498
+ placeholder=(
1499
+ "Optional message after a redaction task (e.g. fix page 3)"
1500
+ ),
1501
+ lines=3,
1502
+ )
1503
+ send = gr.Button("Send follow-up", variant="secondary")
1504
+
1505
+ with gr.Accordion("Thinking log", open=False):
1506
+ activity_log = gr.Markdown(
1507
+ value="_No activity yet._", max_height=480, height=480
1508
+ )
1509
+ tool_panel = gr.Markdown(value="", max_height=480, height=480)
1510
+ thinking_panel = gr.Textbox(
1511
+ label="Thinking (stream)",
1512
+ lines=12,
1513
+ max_lines=50,
1514
+ interactive=False,
1515
+ visible=SHOW_THINKING,
1516
+ elem_classes=["thinking-panel"],
1517
+ autoscroll=True,
1518
+ )
1519
+
1520
+ with gr.Accordion("Workspace output files", open=True):
1521
+ workspace_session_info = gr.Markdown(
1522
+ "_Loading your session workspace…_",
1523
+ )
1524
+ gr.Markdown(
1525
+ "**Final outputs** will appear below. "
1526
+ "Downloads below are available in your session's `output_final_download/` folder. "
1527
+ "Use the file explorer below to browse or download other workspace files."
1528
+ )
1529
+ workspace_output_download = gr.File(
1530
+ label="Final deliverables (download)",
1531
+ file_count="multiple",
1532
+ file_types=[
1533
+ ".pdf",
1534
+ ".jpg",
1535
+ ".jpeg",
1536
+ ".png",
1537
+ ".csv",
1538
+ ".xlsx",
1539
+ ".xls",
1540
+ ".txt",
1541
+ ".doc",
1542
+ ".docx",
1543
+ ".json",
1544
+ ".zip",
1545
+ ],
1546
+ interactive=False,
1547
+ height=200,
1548
+ )
1549
+ refresh_outputs_btn = gr.Button(
1550
+ "Refresh workspace files",
1551
+ variant="secondary",
1552
+ )
1553
+ workspace_output_explorer = gr.FileExplorer(
1554
+ root_dir=str(workspace_base_dir()),
1555
+ label="Browse session workspace",
1556
+ file_count="multiple",
1557
+ interactive=True,
1558
+ max_height=400,
1559
+ )
1560
+
1561
+ with gr.Accordion("Session log outputs", open=False):
1562
+ gr.Markdown(
1563
+ "Pi writes a **JSONL** transcript for the active agent session under "
1564
+ "its `sessions/` directory. The file refreshes after each chat message "
1565
+ "or redaction task completes."
1566
+ )
1567
+ session_log_download = gr.File(
1568
+ label="Pi session log (JSONL)",
1569
+ file_count="single",
1570
+ file_types=[".jsonl"],
1571
+ interactive=False,
1572
+ )
1573
+ agent_finish_signal = gr.State(AGENT_FINISH_SIGNAL_NONE)
1574
+
1575
+ chat_outputs = [
1576
+ chatbot,
1577
+ client_state,
1578
+ msg,
1579
+ activity_log,
1580
+ tool_panel,
1581
+ thinking_panel,
1582
+ session_info,
1583
+ send,
1584
+ abort_btn,
1585
+ start_redact_btn,
1586
+ workspace_output_download,
1587
+ session_log_download,
1588
+ agent_finish_signal,
1589
+ ]
1590
+
1591
+ run_chat_send = send.click(
1592
+ chat_respond,
1593
+ inputs=[
1594
+ msg,
1595
+ chatbot,
1596
+ client_state,
1597
+ session_hash_state,
1598
+ s3_output_folder_state,
1599
+ save_outputs_to_s3_state,
1600
+ redact_file,
1601
+ ],
1602
+ outputs=chat_outputs,
1603
+ )
1604
+ run_chat_send.then(
1605
+ _passthrough_chat_outputs,
1606
+ outputs=chat_outputs,
1607
+ js=PI_AGENT_FINISH_NOTIFY_JS,
1608
+ )
1609
+ run_chat_msg = msg.submit(
1610
+ chat_respond,
1611
+ inputs=[
1612
+ msg,
1613
+ chatbot,
1614
+ client_state,
1615
+ session_hash_state,
1616
+ s3_output_folder_state,
1617
+ save_outputs_to_s3_state,
1618
+ redact_file,
1619
+ ],
1620
+ outputs=chat_outputs,
1621
+ )
1622
+ run_chat_msg.then(
1623
+ _passthrough_chat_outputs,
1624
+ outputs=chat_outputs,
1625
+ js=PI_AGENT_FINISH_NOTIFY_JS,
1626
+ )
1627
+ run_redact_prepare = start_redact_btn.click(
1628
+ prepare_redaction_session_ui,
1629
+ inputs=[session_hash_state],
1630
+ outputs=[session_hash_state, workspace_session_info],
1631
+ )
1632
+ run_redact_task = run_redact_prepare.then(
1633
+ submit_redaction_task,
1634
+ inputs=[
1635
+ redact_file,
1636
+ redact_instructions,
1637
+ page_range,
1638
+ ocr_method,
1639
+ pii_method,
1640
+ encourage_vlm_faces,
1641
+ encourage_vlm_signatures,
1642
+ chatbot,
1643
+ client_state,
1644
+ session_hash_state,
1645
+ s3_output_folder_state,
1646
+ save_outputs_to_s3_state,
1647
+ ],
1648
+ outputs=chat_outputs,
1649
+ )
1650
+ run_redact_task.then(
1651
+ _passthrough_chat_outputs,
1652
+ outputs=chat_outputs,
1653
+ js=PI_AGENT_FINISH_NOTIFY_JS,
1654
+ )
1655
+ abort_btn.click(
1656
+ abort_agent,
1657
+ inputs=[client_state],
1658
+ outputs=[send, abort_btn, start_redact_btn],
1659
+ cancels=[run_chat_send, run_chat_msg, run_redact_task],
1660
+ queue=False,
1661
+ )
1662
+ clear.click(
1663
+ new_chat,
1664
+ inputs=[chatbot, client_state, session_hash_state],
1665
+ outputs=chat_outputs,
1666
+ )
1667
+
1668
+ if not IS_HF_SPACE:
1669
+ backend_provider.change(
1670
+ _backend_model_choices_update,
1671
+ inputs=[backend_provider],
1672
+ outputs=[backend_model],
1673
+ )
1674
+ apply_backend_btn.click(
1675
+ apply_backend,
1676
+ inputs=[
1677
+ backend_provider,
1678
+ backend_model,
1679
+ gemini_api_key,
1680
+ hf_token,
1681
+ aws_region,
1682
+ aws_access_key_id,
1683
+ aws_secret_access_key,
1684
+ aws_session_token,
1685
+ client_state,
1686
+ session_hash_state,
1687
+ ],
1688
+ outputs=[
1689
+ client_state,
1690
+ session_info,
1691
+ gemini_api_key,
1692
+ hf_token,
1693
+ aws_secret_access_key,
1694
+ aws_session_token,
1695
+ ],
1696
+ )
1697
+
1698
+ refresh_outputs_btn.click(
1699
+ fn=refresh_workspace_output_files_stub,
1700
+ inputs=None,
1701
+ outputs=workspace_output_explorer,
1702
+ ).success(
1703
+ fn=refresh_workspace_panel,
1704
+ inputs=[session_hash_state],
1705
+ outputs=[workspace_output_explorer, workspace_output_download],
1706
+ ).success(
1707
+ fn=_export_workspace_outputs,
1708
+ inputs=[
1709
+ session_hash_state,
1710
+ s3_output_folder_state,
1711
+ save_outputs_to_s3_state,
1712
+ ],
1713
+ outputs=None,
1714
+ )
1715
+
1716
+ workspace_output_explorer.input(
1717
+ fn=workspace_files_download_fn,
1718
+ inputs=[workspace_output_explorer, session_hash_state],
1719
+ outputs=workspace_output_download,
1720
+ )
1721
+
1722
+ demo.load(
1723
+ fn=_init_session_ui,
1724
+ inputs=None,
1725
+ outputs=[
1726
+ session_hash_state,
1727
+ workspace_output_explorer,
1728
+ workspace_session_info,
1729
+ workspace_output_download,
1730
+ s3_output_folder_state,
1731
+ ],
1732
+ )
1733
+
1734
+ return demo
1735
+
1736
+
1737
def launch_pi_ui() -> FastAPI | None:
    """Build the UI, enable its queue, then mount it on FastAPI or launch it directly.

    A FastAPI app is created and passed to ``mount_or_launch`` only when
    ``RUN_FASTAPI`` is set; the return value of ``mount_or_launch`` is
    passed back to the caller.
    """
    demo = build_ui()
    demo.queue(default_concurrency_limit=1)
    fastapi_app = create_fastapi_app() if RUN_FASTAPI else None
    return mount_or_launch(
        demo,
        fastapi_app=fastapi_app,
        allowed_paths=gradio_allowed_paths(),
        css=THINKING_PANEL_CSS,
        head_extra=PI_AGENT_FINISH_HEAD_HTML,
        server_name=PI_UI_HOST,
        server_port=PI_UI_PORT,
    )
1750
+
1751
+
1752
# Module-level ASGI app: built at import time only in FastAPI mode so that
# "gradio_app:app" resolves when uvicorn imports this module.
app = launch_pi_ui() if RUN_FASTAPI else None


if __name__ == "__main__":
    if not RUN_FASTAPI:
        # Direct mode: launch the Gradio UI without uvicorn.
        launch_pi_ui()
    else:
        import uvicorn

        uvicorn.run(
            "gradio_app:app",
            host=PI_UI_HOST,
            port=PI_UI_PORT,
            factory=False,
        )
agent-redact/pi/output_files.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browse and download files from the Pi agent shared workspace."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import gradio as gr
12
+ from bootstrap_pi_config import pi_repo_root_path
13
+ from pi_examples import gradio_example_allowed_paths
14
+ from session_logs import gradio_session_log_allowed_paths
15
+ from session_workspace import (
16
+ sanitize_session_id,
17
+ session_workspace_dir,
18
+ workspace_base_dir,
19
+ )
20
+
21
+ REFRESH_STUB_DIR = Path(os.environ.get("PI_FILEEXPLORER_STUB_DIR", "/tmp"))
22
+
23
+ # Folder names under ``.../review/`` where Pass 1 deliverables are saved (see partnership prompt).
24
+ _DEFAULT_FINAL_OUTPUT_FOLDER_NAMES = ("output_review_final", "output_final")
25
+ _DEFAULT_FINAL_DOWNLOAD_FOLDER = "output_final_download"
26
+ _DEFAULT_GRADIO_PREFIX_MIN_LEN = 16
27
+
28
+
29
def final_output_folder_names() -> frozenset[str]:
    """Return the folder names that count as final-output directories.

    Reads the comma-separated ``PI_FINAL_OUTPUT_FOLDER_NAMES`` environment
    variable, trimming each entry and dropping blanks. Falls back to the
    built-in defaults when the variable is unset or yields no names.
    """
    configured = os.environ.get("PI_FINAL_OUTPUT_FOLDER_NAMES", "")
    cleaned = frozenset(
        token
        for token in (piece.strip() for piece in configured.split(","))
        if token
    )
    return cleaned or frozenset(_DEFAULT_FINAL_OUTPUT_FOLDER_NAMES)
36
+
37
+
38
def _is_under_final_output_dir(relative_path: Path) -> bool:
    """Return True when the path contains ``review/<final-output-folder>``.

    The pair may appear at any depth: a ``review`` segment immediately
    followed by one of the names from ``final_output_folder_names()``.
    """
    segments = relative_path.parts
    accepted = final_output_folder_names()
    return any(
        parent == "review" and child in accepted
        for parent, child in zip(segments, segments[1:])
    )
46
+
47
+
48
def final_download_folder_name() -> str:
    """Return the staging folder name from ``PI_FINAL_DOWNLOAD_FOLDER``.

    Surrounding whitespace is trimmed; an unset or blank value yields the
    built-in default folder name.
    """
    configured = os.environ.get("PI_FINAL_DOWNLOAD_FOLDER")
    if configured is None:
        configured = _DEFAULT_FINAL_DOWNLOAD_FOLDER
    name = (configured or "").strip()
    if name:
        return name
    return _DEFAULT_FINAL_DOWNLOAD_FOLDER
52
+
53
+
54
def final_download_dir(session_hash: str | None = None) -> Path:
    """
    Return the staging directory used for ``gr.File`` downloads.

    With a non-blank ``session_hash`` the result is
    ``<workspace base>/<sanitized session id>/<download folder>``; without one
    it is ``<workspace base>/<download folder>``. The directory is not created
    here.
    """
    workspace = workspace_base_dir().resolve()
    staging_name = final_download_folder_name()
    has_session = bool(session_hash) and bool(str(session_hash).strip())
    if has_session:
        return workspace / sanitize_session_id(str(session_hash)) / staging_name
    return workspace / staging_name
67
+
68
+
69
def _remove_path(path: Path) -> None:
    """Delete a file or directory tree on a best-effort basis.

    Real directories (not symlinks) are removed with ``shutil.rmtree`` ignoring
    errors; everything else is unlinked. If the first attempt raises
    ``OSError`` and the path still exists, permissions are relaxed to ``0o666``
    and the delete is retried once. A failure of the retry is swallowed.
    """

    def _delete_once() -> None:
        is_real_dir = path.is_dir() and not path.is_symlink()
        if is_real_dir:
            shutil.rmtree(path, ignore_errors=True)
            return
        path.unlink(missing_ok=True)

    try:
        _delete_once()
        return
    except OSError:
        pass

    if not path.exists():
        return
    try:
        # Read-only entries (e.g. on Windows) need write permission first.
        os.chmod(path, 0o666)
        _delete_once()
    except OSError:
        pass
87
+
88
+
89
def _reset_download_dir(download_dir: Path) -> None:
    """Ensure ``download_dir`` exists, then remove each of its entries.

    The directory itself is kept; only its children are passed to
    ``_remove_path``.
    """
    download_dir.mkdir(parents=True, exist_ok=True)
    stale_entries = download_dir.iterdir()
    for entry in stale_entries:
        _remove_path(entry)
94
+
95
+
96
def _gradio_prefix_min_len() -> int:
    """Return the minimum length of a Gradio cache prefix (at least 1).

    Reads ``PI_GRADIO_FILENAME_PREFIX_MIN_LEN``; a non-integer value yields the
    built-in default.
    """
    fallback = _DEFAULT_GRADIO_PREFIX_MIN_LEN
    configured = os.environ.get(
        "PI_GRADIO_FILENAME_PREFIX_MIN_LEN",
        str(fallback),
    )
    try:
        parsed = int(configured)
    except ValueError:
        return fallback
    return parsed if parsed > 1 else 1
105
+
106
+
107
def strip_gradio_cache_prefix(filename: str) -> str:
    """
    Drop a leading ``{alphanumeric}_`` cache prefix from ``filename``.

    The prefix must be at least ``_gradio_prefix_min_len()`` alphanumeric
    characters followed by an underscore and a non-empty remainder. Names that
    do not match are returned unchanged.
    """
    found = re.match(
        rf"^[A-Za-z0-9]{{{_gradio_prefix_min_len()},}}_(.+)$",
        filename,
    )
    return found.group(1) if found else filename
119
+
120
+
121
def _file_created_timestamp(path: Path) -> float:
    """Return the file's creation time, falling back to its modification time.

    ``st_birthtime`` is used when the stat result exposes it with a positive
    value; otherwise ``st_mtime`` is returned.
    """
    info = path.stat()
    created = getattr(info, "st_birthtime", None)
    usable_birth = created is not None and created > 0
    return float(created if usable_birth else info.st_mtime)
127
+
128
+
129
def _collect_raw_final_output_files(
    session_hash: str | None = None,
) -> list[Path] | None:
    """
    Collect deliverable files from ``review/<final-output-folder>/`` anywhere
    under the session workspace.

    A path is kept when it is a regular file whose name passes
    ``_is_file_path``, it is not inside the staging download folder, it sits
    under a final-output folder, and its resolved location is still inside the
    workspace (so symlinks pointing outside are skipped).

    Returns ``None`` when the workspace is missing, nothing qualifies, or the
    walk fails with ``OSError``.
    """
    root = workspace_root_from(session_hash)
    if not root.is_dir():
        return None

    download_folder = final_download_folder_name()
    candidates: list[Path] = []
    try:
        # Resolve the root once so the containment check below compares like
        # with like, as ``_resolve_under_workspace`` does; an unresolved root
        # reached through a symlink would otherwise reject every file.
        resolved_root = root.resolve()
        for path in root.rglob("*"):
            if not path.is_file() or not _is_file_path(path.name):
                continue
            try:
                relative = path.relative_to(root)
            except ValueError:
                continue
            if download_folder in relative.parts:
                # Never re-collect previously staged copies.
                continue
            if not _is_under_final_output_dir(relative):
                continue
            try:
                path.resolve(strict=False).relative_to(resolved_root)
            except ValueError:
                # Resolved target escapes the workspace.
                continue
            candidates.append(path)
    except OSError:
        return None

    return candidates or None
165
+
166
+
167
def build_final_download_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """
    Copy final deliverables into the session's staging folder for ``gr.File``.

    The staging folder is emptied first. Source names have any Gradio cache
    prefix stripped; when several sources share a cleaned name, the one with
    the latest creation timestamp is kept. Returns the resolved staged paths
    sorted by name, or ``None`` when there is nothing to stage.
    """
    sources = _collect_raw_final_output_files(session_hash)
    if not sources:
        return None

    staging_dir = final_download_dir(session_hash)
    _reset_download_dir(staging_dir)

    # Oldest first, so a newer file overwrites an older one under the same key.
    winners: dict[str, Path] = {
        strip_gradio_cache_prefix(candidate.name): candidate
        for candidate in sorted(sources, key=_file_created_timestamp)
    }

    results: list[str] = []
    for clean_name, origin in sorted(winners.items(), key=lambda item: item[0]):
        target = staging_dir / clean_name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(origin, target)
        results.append(str(target.resolve()))
    return results or None
196
+
197
+
198
def collect_final_output_files(
    session_hash: str | None = None,
) -> list[str] | None:
    """Return staged deliverable paths by delegating to ``build_final_download_files``."""
    staged_paths = build_final_download_files(session_hash)
    return staged_paths
203
+
204
+
205
def workspace_root_from(session_hash: str | None = None) -> Path:
    """Return the workspace root for a session hash.

    A missing or blank hash yields the resolved workspace base directory;
    otherwise the trimmed hash is passed to ``session_workspace_dir``.
    """
    token = str(session_hash).strip() if session_hash else ""
    if token:
        return session_workspace_dir(token)
    return workspace_base_dir().resolve()
210
+
211
+
212
def _is_file_path(path: str) -> bool:
    """Heuristically decide whether ``path`` names a file.

    After trailing slashes are removed, the final component must contain a dot
    and end in an alphanumeric extension of 1 to 10 characters.
    """
    if not (path and path.strip()):
        return False
    basename = Path(path.rstrip("/\\")).name
    _stem, dot, suffix = basename.rpartition(".")
    if not dot:
        return False
    return 0 < len(suffix) <= 10 and suffix.isalnum()
220
+
221
+
222
def _is_safe_workspace_relative_path(path: str) -> bool:
    """Return True for a relative path with no anchor and no traversal segments.

    Blank input, absolute or anchored paths, and any path containing an empty,
    ``.`` or ``..`` component are rejected.
    """
    trimmed = path.strip() if path else ""
    if not trimmed:
        return False
    probe = Path(trimmed)
    if probe.is_absolute() or probe.anchor:
        return False
    forbidden = {"", ".", ".."}
    return forbidden.isdisjoint(probe.parts)
230
+
231
+
232
def _resolve_under_workspace(
    path: str,
    *,
    workspace_root: Path | None = None,
) -> Path | None:
    """Resolve ``path`` and return it only if it is a file inside the workspace.

    Absolute paths are resolved as given; relative paths must pass
    ``_is_safe_workspace_relative_path`` and are joined under the root. The
    root defaults to ``workspace_base_dir()`` and is always resolved. Returns
    ``None`` for blank input, unsafe paths, paths outside the root, or targets
    that are not regular files.
    """
    cleaned = path.strip() if path else ""
    if not cleaned:
        return None

    base = (workspace_root or workspace_base_dir()).resolve()
    try:
        requested = Path(cleaned)
        if requested.is_absolute():
            # Gradio FileExplorer may return absolute paths already under root_dir.
            target = requested.resolve(strict=False)
        else:
            if not _is_safe_workspace_relative_path(cleaned):
                return None
            target = base.joinpath(*requested.parts).resolve(strict=False)
        # Raises ValueError when the resolved target escapes the root.
        target.relative_to(base)
    except (ValueError, OSError):
        return None

    if target.is_file():
        return target
    return None
255
+
256
+
257
def load_workspace_output_files(session_hash: str = ""):
    """Return a ``gr.FileExplorer`` rooted at the session workspace.

    The workspace directory is created if it does not exist yet.
    """
    session_root = workspace_root_from(session_hash or None)
    session_root.mkdir(parents=True, exist_ok=True)
    explorer_root = str(session_root)
    return gr.FileExplorer(root_dir=explorer_root)
261
+
262
+
263
def refresh_workspace_output_files_stub():
    """Return a ``gr.FileExplorer`` rooted at the resolved ``REFRESH_STUB_DIR``.

    NOTE(review): presumably a temporary root used to force the explorer to
    re-render before it is pointed back at the real workspace — confirm.
    """
    stub_root = REFRESH_STUB_DIR.resolve()
    return gr.FileExplorer(root_dir=str(stub_root))
265
+
266
+
267
def gradio_allowed_paths() -> list[str]:
    """Return the de-duplicated list of paths Gradio may serve via ``gr.File``.

    Covers the workspace base, the Pi repo root, the refresh stub directory and
    ``/tmp`` (each resolved), followed by the example and session-log paths as
    supplied by their helpers. Insertion order is preserved.
    """
    collected: list[str] = []

    def _remember(entry: str) -> None:
        if entry not in collected:
            collected.append(entry)

    core_roots = (
        workspace_base_dir(),
        str(pi_repo_root_path()),
        REFRESH_STUB_DIR,
        "/tmp",
    )
    for location in core_roots:
        try:
            normalized = str(Path(location).resolve())
        except OSError:
            continue
        _remember(normalized)

    for extra_source in (
        gradio_example_allowed_paths,
        gradio_session_log_allowed_paths,
    ):
        for entry in extra_source():
            _remember(entry)
    return collected
289
+
290
+
291
def refresh_workspace_panel(
    session_hash: str = "",
) -> tuple[Any, list[str] | None]:
    """Return a refreshed file explorer plus the staged final deliverables."""
    explorer = load_workspace_output_files(session_hash)
    deliverables = collect_final_output_files(session_hash)
    return explorer, deliverables
299
+
300
+
301
def workspace_files_download_fn(
    selected: list[str] | None,
    session_hash: str = "",
) -> list[str] | None:
    """Filter explorer selections down to files inside the session workspace.

    Entries that do not look like files, or that do not resolve to an existing
    file under the workspace root, are dropped. Returns ``None`` when nothing
    remains.
    """
    if not selected:
        return None
    session_root = workspace_root_from(session_hash or None)
    resolved_hits = (
        _resolve_under_workspace(item, workspace_root=session_root)
        for item in selected
        if _is_file_path(item)
    )
    safe_paths = [str(hit) for hit in resolved_hits if hit is not None]
    return safe_paths or None
agent-redact/pi/pi_agent_config.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate Pi agent models.json and settings.json at runtime."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
def resolve_agent_dir() -> Path:
    """Return the Pi agent config directory.

    Uses ``PI_CODING_AGENT_DIR`` when it is set to a non-blank value, otherwise
    ``~/.pi/agent``. A blank value is treated as unset so an empty entry in an
    env file does not silently redirect the agent directory to the current
    working directory. The home directory is only looked up when needed.
    """
    configured = (os.environ.get("PI_CODING_AGENT_DIR") or "").strip()
    if configured:
        return Path(configured)
    return Path.home() / ".pi" / "agent"
13
+
14
+
15
+ # Back-compat alias; prefer resolve_agent_dir() when env may change after import.
16
+ AGENT_DIR = resolve_agent_dir()
17
+ TEMPLATE_DIR = Path(__file__).resolve().parent / "agent"
18
+ SETTINGS_TEMPLATE = TEMPLATE_DIR / "settings.json"
19
+
20
+ DEPLOYMENT_LOCAL = "local-docker"
21
+ DEPLOYMENT_HF_SPACE = "hf-space"
22
+ DEPLOYMENT_PROFILE = (
23
+ os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL).strip().lower()
24
+ )
25
+
26
+
27
def pi_max_retries() -> int:
    """Return the max retry count for Pi auto-retry and Gradio quota backoff.

    ``PI_QUOTA_RETRY_ATTEMPTS`` takes precedence over ``PI_MAX_RETRIES``. The
    first of the two holding a valid integer wins; blank or malformed values
    are skipped rather than raising. Defaults to 5.
    """
    for name in ("PI_QUOTA_RETRY_ATTEMPTS", "PI_MAX_RETRIES"):
        raw = (os.environ.get(name) or "").strip()
        if not raw:
            continue
        try:
            return int(raw)
        except ValueError:
            # Malformed value: try the next variable, then the default.
            continue
    return 5
35
+
36
+
37
def _apply_retry_settings(
    settings: dict[str, Any],
    *,
    provider: str,
) -> None:
    """Write the ``retry`` block into ``settings`` in place.

    For the Gemini provider or the HF Space profile the delays come from
    ``PI_GEMINI_RETRY_BASE_DELAY_MS`` (default 60000) and
    ``PI_GEMINI_RETRY_MAX_DELAY_MS`` (default 90000); otherwise 2000 and 60000
    are used.
    """
    retries = pi_max_retries()
    if provider == PROVIDER_GEMINI or is_hf_space_profile():
        base_ms = int(os.environ.get("PI_GEMINI_RETRY_BASE_DELAY_MS", "60000"))
        cap_ms = int(os.environ.get("PI_GEMINI_RETRY_MAX_DELAY_MS", "90000"))
    else:
        base_ms, cap_ms = 2000, 60000

    provider_block = {
        "timeoutMs": 3600000,
        "maxRetries": retries,
        "maxRetryDelayMs": cap_ms,
    }
    settings["retry"] = {
        "enabled": True,
        "maxRetries": retries,
        "baseDelayMs": base_ms,
        "provider": provider_block,
    }
60
+
61
+
62
+ PROVIDER_LLAMA = "llama-cpp"
63
+ PROVIDER_GEMINI = "google-gemini"
64
+ PROVIDER_BEDROCK = "amazon-bedrock"
65
+
66
+ PROVIDER_LABELS: dict[str, str] = {
67
+ PROVIDER_LLAMA: "Local (llama-cpp)",
68
+ PROVIDER_GEMINI: "Gemini",
69
+ PROVIDER_BEDROCK: "AWS Bedrock",
70
+ }
71
+
72
+
73
def is_hf_space_profile() -> bool:
    """Return True when ``PI_DEPLOYMENT_PROFILE`` selects the HF Space profile.

    The variable is read on every call and compared case-insensitively after
    trimming; it defaults to the local profile.
    """
    configured = os.environ.get("PI_DEPLOYMENT_PROFILE", DEPLOYMENT_LOCAL)
    return configured.strip().lower() == DEPLOYMENT_HF_SPACE
76
+
77
+
78
+ LLAMA_BASE_URL = os.environ.get("PI_LLAMA_BASE_URL", "http://llama-inference:8080/v1")
79
+ LLAMA_MODEL_ID = os.environ.get("PI_LLAMA_MODEL_ID", "unsloth/Qwen3.6-27B-MTP-GGUF")
80
+ LLAMA_CONTEXT = int(os.environ.get("PI_LLAMA_CONTEXT_WINDOW", "114688"))
81
+ LLAMA_MAX_TOKENS = int(os.environ.get("PI_LLAMA_MAX_TOKENS", "32768"))
82
+
83
+ GEMINI_MODELS: tuple[tuple[str, str, int, bool], ...] = (
84
+ ("gemini-flash-lite-latest", "Gemini Flash Lite", 1048576, False),
85
+ ("gemini-flash-latest", "Gemini Flash", 1048576, True),
86
+ ("gemini-pro-latest", "Gemini Pro", 1048576, True),
87
+ )
88
+
89
+ BEDROCK_MODELS: tuple[tuple[str, str, int, bool], ...] = (
90
+ (
91
+ "anthropic.claude-3-haiku-20240307-v1:0",
92
+ "Claude 3 Haiku (Bedrock)",
93
+ 200000,
94
+ False,
95
+ ),
96
+ (
97
+ "anthropic.claude-3-7-sonnet-20250219-v1:0",
98
+ "Claude 3.7 Sonnet (Bedrock)",
99
+ 200000,
100
+ True,
101
+ ),
102
+ (
103
+ "anthropic.claude-sonnet-4-5-20250929-v1:0",
104
+ "Claude Sonnet 4.5 (Bedrock)",
105
+ 200000,
106
+ True,
107
+ ),
108
+ ("anthropic.claude-sonnet-4-6", "Claude Sonnet 4.6 (Bedrock)", 200000, True),
109
+ ("amazon.nova-micro-v1:0", "Amazon Nova Micro (Bedrock)", 128000, False),
110
+ ("amazon.nova-lite-v1:0", "Amazon Nova Lite (Bedrock)", 300000, False),
111
+ ("amazon.nova-pro-v1:0", "Amazon Nova Pro (Bedrock)", 300000, False),
112
+ )
113
+
114
+ PROVIDER_MODELS: dict[str, list[str]] = {
115
+ PROVIDER_LLAMA: [LLAMA_MODEL_ID],
116
+ PROVIDER_GEMINI: [model_id for model_id, _, _, _ in GEMINI_MODELS],
117
+ PROVIDER_BEDROCK: [model_id for model_id, _, _, _ in BEDROCK_MODELS],
118
+ }
119
+
120
+ DEFAULT_MODEL_BY_PROVIDER: dict[str, str] = {
121
+ PROVIDER_LLAMA: LLAMA_MODEL_ID,
122
+ PROVIDER_GEMINI: GEMINI_MODELS[0][0], # Gemini Flash Lite
123
+ PROVIDER_BEDROCK: "anthropic.claude-sonnet-4-6",
124
+ }
125
+
126
+
127
def get_default_provider() -> str:
    """Return the default Pi provider, evaluated on every call.

    The HF Space profile always yields Gemini. Otherwise
    ``PI_DEFAULT_PROVIDER`` is used when it names a known provider, and the
    local llama provider is the fallback.
    """
    if is_hf_space_profile():
        return PROVIDER_GEMINI
    requested = (os.environ.get("PI_DEFAULT_PROVIDER") or PROVIDER_LLAMA).strip()
    return requested if requested in PROVIDER_MODELS else PROVIDER_LLAMA
135
+
136
+
137
+ DEFAULT_PROVIDER = get_default_provider()
138
+
139
+ _env_default_model = (os.environ.get("PI_DEFAULT_MODEL") or "").strip()
140
+ DEFAULT_MODEL = _env_default_model or DEFAULT_MODEL_BY_PROVIDER.get(
141
+ DEFAULT_PROVIDER, LLAMA_MODEL_ID
142
+ )
143
+
144
+
145
def resolved_default_model(provider: str, *, override: str | None = None) -> str:
    """
    Choose the default model id for ``provider``.

    Precedence: ``override`` if listed for the provider, then
    ``PI_DEFAULT_MODEL`` (or the import-time ``DEFAULT_MODEL``) if listed for
    the provider, then the built-in per-provider default.
    """
    known = PROVIDER_MODELS.get(provider, [])
    if override and override in known:
        return override

    preferred = os.environ.get("PI_DEFAULT_MODEL") or DEFAULT_MODEL or ""
    preferred = preferred.strip()
    if preferred and preferred in known:
        return preferred

    return DEFAULT_MODEL_BY_PROVIDER.get(provider, LLAMA_MODEL_ID)
159
+
160
+
161
def _zero_cost() -> dict[str, int]:
    """Return a fresh cost mapping with every rate set to zero."""
    return dict.fromkeys(("input", "output", "cacheRead", "cacheWrite"), 0)
163
+
164
+
165
def _model_entry(
    model_id: str,
    name: str,
    *,
    context_window: int,
    max_tokens: int,
    reasoning: bool,
    image_input: bool = True,
) -> dict[str, Any]:
    """Build one model entry for ``models.json``.

    ``input`` lists ``text`` and, when ``image_input`` is true, ``image``.
    Cost is always the zero-cost mapping.
    """
    modalities = ["text"]
    if image_input:
        modalities.append("image")

    entry: dict[str, Any] = {"id": model_id, "name": name}
    entry["reasoning"] = reasoning
    entry["input"] = modalities
    entry["contextWindow"] = context_window
    entry["maxTokens"] = max_tokens
    entry["cost"] = _zero_cost()
    return entry
184
+
185
+
186
def _llama_provider() -> dict[str, Any]:
    """Return the provider config for the local llama-cpp server."""
    compat_flags = {
        "supportsDeveloperRole": False,
        "supportsReasoningEffort": False,
        "supportsUsageInStreaming": False,
        "maxTokensField": "max_tokens",
    }
    local_model = _model_entry(
        LLAMA_MODEL_ID,
        "Qwen 3.6 27B (local)",
        context_window=LLAMA_CONTEXT,
        max_tokens=LLAMA_MAX_TOKENS,
        reasoning=False,
    )
    return {
        "baseUrl": LLAMA_BASE_URL,
        "api": "openai-completions",
        "apiKey": "llama-cpp",
        "compat": compat_flags,
        "models": [local_model],
    }
207
+
208
+
209
def _gemini_provider() -> dict[str, Any]:
    """Return the Gemini provider config with one entry per ``GEMINI_MODELS`` row."""
    entries: list[dict[str, Any]] = []
    for model_id, label, window, can_reason in GEMINI_MODELS:
        entries.append(
            _model_entry(
                model_id,
                label,
                context_window=window,
                max_tokens=8192,
                reasoning=can_reason,
            )
        )
    return {
        "baseUrl": "https://generativelanguage.googleapis.com/v1beta",
        "api": "google-generative-ai",
        "apiKey": "GEMINI_API_KEY",
        "models": entries,
    }
221
+
222
+
223
def _bedrock_region() -> str:
    """Return the first non-empty of ``AWS_REGION``, ``AWS_DEFAULT_REGION``, else ``eu-west-2``."""
    for variable in ("AWS_REGION", "AWS_DEFAULT_REGION"):
        value = os.environ.get(variable)
        if value:
            return value
    return "eu-west-2"
229
+
230
+
231
+ _AWS_CREDENTIAL_ENV_KEYS: tuple[str, ...] = (
232
+ "AWS_ACCESS_KEY_ID",
233
+ "AWS_SECRET_ACCESS_KEY",
234
+ "AWS_SESSION_TOKEN",
235
+ "AWS_ACCESS_KEY",
236
+ "AWS_SECRET_KEY",
237
+ )
238
+ _AWS_PROFILE_ENV_KEYS: tuple[str, ...] = ("AWS_PROFILE", "PI_AWS_PROFILE")
239
+
240
+
241
def _env_flag(name: str, *, default: bool = False) -> bool:
    """Interpret environment variable ``name`` as a boolean.

    Returns ``default`` when the variable is unset. Otherwise the trimmed,
    lower-cased value is true only for ``1``, ``true``, ``yes`` or ``on``.
    """
    try:
        value = os.environ[name]
    except KeyError:
        return default
    truthy = {"1", "true", "yes", "on"}
    return value.strip().lower() in truthy
246
+
247
+
248
def _strip_empty_env_vars(names: tuple[str, ...]) -> None:
    """Remove each named environment variable whose value is empty or blank."""
    blank = [name for name in names if not (os.environ.get(name) or "").strip()]
    for name in blank:
        os.environ.pop(name, None)
252
+
253
+
254
def _mirror_legacy_aws_key_env_vars() -> None:
    """Copy legacy AWS key variables into their standard names when missing.

    ``AWS_ACCESS_KEY`` fills ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_KEY`` fills
    ``AWS_SECRET_ACCESS_KEY``, but only when the standard variable is blank and
    the legacy one is not. Values are trimmed before being copied.
    """
    pairs = (
        ("AWS_ACCESS_KEY_ID", "AWS_ACCESS_KEY"),
        ("AWS_SECRET_ACCESS_KEY", "AWS_SECRET_KEY"),
    )
    for standard_name, legacy_name in pairs:
        if (os.environ.get(standard_name) or "").strip():
            continue
        fallback = (os.environ.get(legacy_name) or "").strip()
        if fallback:
            os.environ[standard_name] = fallback
263
+
264
+
265
def _has_explicit_aws_access_keys() -> bool:
    """Return True when both an access key and a secret key are set in the env.

    For each of the two, the standard variable is preferred and the legacy one
    is the fallback; the chosen value must be non-blank after trimming.
    """

    def _pick(standard_name: str, legacy_name: str) -> str:
        chosen = os.environ.get(standard_name) or os.environ.get(legacy_name) or ""
        return chosen.strip()

    access = _pick("AWS_ACCESS_KEY_ID", "AWS_ACCESS_KEY")
    secret = _pick("AWS_SECRET_ACCESS_KEY", "AWS_SECRET_KEY")
    if not access:
        return False
    return bool(secret)
275
+
276
+
277
def _aws_config_path() -> Path | None:
    """Return the AWS config file path if it exists as a file, else ``None``.

    A non-blank ``AWS_CONFIG_FILE`` (with ``~`` expanded) is the only location
    checked when set. Otherwise ``$HOME/.aws/config`` is checked, with ``HOME``
    defaulting to ``/home/node``.
    """
    override = (os.environ.get("AWS_CONFIG_FILE") or "").strip()
    if override:
        candidate = Path(override).expanduser()
    else:
        home_dir = Path(os.environ.get("HOME", "/home/node"))
        candidate = home_dir / ".aws" / "config"
    if candidate.is_file():
        return candidate
    return None
285
+
286
+
287
def _discover_aws_profile_from_config() -> str | None:
    """Pick an AWS profile name for Pi/Bedrock.

    ``PI_AWS_PROFILE`` and then ``AWS_PROFILE`` win when non-blank. Otherwise
    the AWS config file is scanned for ``[default]`` and ``[profile NAME]``
    headers, and the choice is: the first profile containing a key that starts
    with ``sso_session``, else ``default`` if present, else the first profile
    seen. Returns ``None`` when no config file or profile is found.
    """
    chosen = (os.environ.get("PI_AWS_PROFILE") or "").strip() or (
        os.environ.get("AWS_PROFILE") or ""
    ).strip()
    if chosen:
        return chosen

    config_file = _aws_config_path()
    if not config_file:
        return None

    header_prefix = "[profile "
    active: str | None = None
    seen: list[str] = []
    sso_backed: list[str] = []

    for text in config_file.read_text(encoding="utf-8").splitlines():
        entry = text.strip()
        if not entry or entry.startswith(("#", ";")):
            continue
        if entry == "[default]":
            active = "default"
            seen.append(active)
        elif entry.startswith(header_prefix) and entry.endswith("]"):
            active = entry[len(header_prefix) : -1].strip()
            if active:
                seen.append(active)
        elif active and entry.startswith("sso_session"):
            sso_backed.append(active)

    if sso_backed:
        return sso_backed[0]
    if "default" in seen:
        return "default"
    if seen:
        return seen[0]
    return None
324
+
325
+
326
def _region_from_aws_config(profile: str | None = None) -> str | None:
    """Return the ``region`` value from one profile block of the AWS config file.

    The profile defaults to the one chosen by
    ``_discover_aws_profile_from_config``. Returns ``None`` when there is no
    config file, no target profile, or no non-empty region line in that block.
    """
    config_file = _aws_config_path()
    if not config_file:
        return None

    wanted = (profile or _discover_aws_profile_from_config() or "").strip()
    if not wanted:
        return None

    header_prefix = "[profile "
    section: str | None = None
    for text in config_file.read_text(encoding="utf-8").splitlines():
        entry = text.strip()
        if not entry or entry.startswith(("#", ";")):
            continue
        if entry == "[default]":
            section = "default"
        elif entry.startswith(header_prefix) and entry.endswith("]"):
            section = entry[len(header_prefix) : -1].strip()
        elif section == wanted and entry.startswith("region"):
            candidate = entry.partition("=")[2].strip()
            if candidate:
                return candidate
    return None
355
+
356
+
357
def _ensure_aws_region_env() -> None:
    """Set ``AWS_REGION`` and ``AWS_DEFAULT_REGION`` to one non-empty region.

    Resolution order: an existing non-blank env value, then the ``region`` of
    the ``AWS_PROFILE`` block in the AWS config file, then
    ``_bedrock_region()``.
    """
    region_vars = ("AWS_REGION", "AWS_DEFAULT_REGION")
    _strip_empty_env_vars(region_vars)

    resolved = (
        os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION") or ""
    ).strip()
    if not resolved:
        active_profile = (os.environ.get("AWS_PROFILE") or "").strip()
        resolved = (_region_from_aws_config(active_profile) or "").strip()
    if not resolved:
        resolved = _bedrock_region()

    for variable in region_vars:
        os.environ[variable] = resolved
370
+
371
+
372
def _pi_bedrock_auth_visible() -> bool:
    """Return True when env vars expose some form of Bedrock auth.

    That is any of: a non-blank ``AWS_PROFILE``, explicit access keys, or a
    non-blank ``AWS_BEARER_TOKEN_BEDROCK``.
    """

    def _is_set(variable: str) -> bool:
        return bool((os.environ.get(variable) or "").strip())

    return (
        _is_set("AWS_PROFILE")
        or _has_explicit_aws_access_keys()
        or _is_set("AWS_BEARER_TOKEN_BEDROCK")
    )
381
+
382
+
383
def _ensure_pi_bedrock_auth_env() -> None:
    """
    Set ``AWS_PROFILE`` from the AWS config file when no auth is visible in env.

    Does nothing when ``_pi_bedrock_auth_visible()`` is already true or when no
    profile can be discovered.
    """
    if not _pi_bedrock_auth_visible():
        discovered = _discover_aws_profile_from_config()
        if discovered:
            os.environ["AWS_PROFILE"] = discovered
395
+
396
+
397
def configure_aws_credentials(
    *,
    session_access_key_id: str | None = None,
    session_secret_access_key: str | None = None,
    session_session_token: str | None = None,
) -> None:
    """
    Align the process AWS environment for Pi's Bedrock provider.

    Precedence:

    1. Explicit session keys (both access key id and secret non-blank) always
       win; the session token is set when given and cleared otherwise.
    2. With ``RUN_AWS_FUNCTIONS`` enabled and
       ``PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS`` true (the default), static
       key variables are removed so the default credential chain is used.
    3. With ``RUN_AWS_FUNCTIONS`` enabled and the prioritise flag false, static
       env keys are kept; a profile is discovered only if no auth is visible.

    In every case blank credential and profile variables are stripped first,
    legacy key names are mirrored, ``PI_AWS_PROFILE`` is propagated to
    ``AWS_PROFILE`` when the latter is unset, and a region is ensured.

    NOTE(review): this mutates process-wide ``os.environ``; if several UI
    sessions share one process, session keys would presumably be visible to
    all of them — confirm.
    """
    _strip_empty_env_vars(_AWS_CREDENTIAL_ENV_KEYS)
    _strip_empty_env_vars(_AWS_PROFILE_ENV_KEYS)
    _mirror_legacy_aws_key_env_vars()

    access_key = (session_access_key_id or "").strip()
    secret_key = (session_secret_access_key or "").strip()
    if access_key and secret_key:
        os.environ["AWS_ACCESS_KEY_ID"] = access_key
        os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key
        token = (session_session_token or "").strip()
        if token:
            os.environ["AWS_SESSION_TOKEN"] = token
        else:
            # Drop any stale token that belonged to different keys.
            os.environ.pop("AWS_SESSION_TOKEN", None)
        _ensure_aws_region_env()
        return

    run_aws = _env_flag("RUN_AWS_FUNCTIONS")
    prioritise_sso = _env_flag("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", default=True)

    if run_aws:
        if prioritise_sso:
            # Only remove static keys when SSO is prioritised; the two branches
            # were previously identical, so the flag had no effect.
            for key in _AWS_CREDENTIAL_ENV_KEYS:
                os.environ.pop(key, None)
        _ensure_pi_bedrock_auth_env()

    # Propagate PI_AWS_PROFILE when only that alias is set (e.g. pi_agent.env).
    pi_profile = (os.environ.get("PI_AWS_PROFILE") or "").strip()
    if pi_profile and not (os.environ.get("AWS_PROFILE") or "").strip():
        os.environ["AWS_PROFILE"] = pi_profile

    _ensure_aws_region_env()
449
+
450
+
451
def _aws_credential_status() -> str:
    """Return a short label for the AWS credential source visible in env.

    Checked in order: explicit access keys, ``AWS_PROFILE``, Bedrock bearer
    token, a mounted AWS config file, ``RUN_AWS_FUNCTIONS``; otherwise
    ``missing``.
    """
    if _has_explicit_aws_access_keys():
        return "access keys"

    profile = (os.environ.get("AWS_PROFILE") or "").strip()
    if profile:
        return f"profile `{profile}`"

    bearer_token = (os.environ.get("AWS_BEARER_TOKEN_BEDROCK") or "").strip()
    if bearer_token:
        return "Bedrock bearer token"

    if _aws_config_path():
        return "SSO config mounted (profile not set)"

    return (
        "SSO/default chain (missing profile)"
        if _env_flag("RUN_AWS_FUNCTIONS")
        else "missing"
    )
464
+
465
+
466
def _bedrock_provider() -> dict[str, Any]:
    """Return the Bedrock provider config for the current region.

    One model entry is built per ``BEDROCK_MODELS`` row.
    """
    active_region = _bedrock_region()
    entries: list[dict[str, Any]] = []
    for model_id, label, window, can_reason in BEDROCK_MODELS:
        entries.append(
            _model_entry(
                model_id,
                label,
                context_window=window,
                max_tokens=8192,
                reasoning=can_reason,
            )
        )
    return {
        "baseUrl": f"https://bedrock-runtime.{active_region}.amazonaws.com",
        "api": "bedrock-converse-stream",
        "models": entries,
    }
482
+
483
+
484
def build_models_config() -> dict[str, Any]:
    """Return the ``models.json`` payload.

    The HF Space profile exposes only Gemini; every other profile exposes
    llama-cpp, Gemini and Bedrock.
    """
    if is_hf_space_profile():
        providers = {PROVIDER_GEMINI: _gemini_provider()}
    else:
        providers = {
            PROVIDER_LLAMA: _llama_provider(),
            PROVIDER_GEMINI: _gemini_provider(),
            PROVIDER_BEDROCK: _bedrock_provider(),
        }
    return {"providers": providers}
494
+
495
+
496
def _load_settings_template() -> dict[str, Any]:
    """Load the settings template JSON, or return built-in defaults if it is absent."""
    if not SETTINGS_TEMPLATE.is_file():
        default_compaction = {
            "enabled": True,
            "reserveTokens": 32768,
            "keepRecentTokens": 20000,
        }
        return {
            "defaultThinkingLevel": "off",
            "hideThinkingBlock": True,
            "compaction": default_compaction,
            "enableSkillCommands": True,
            "sessionDir": "sessions",
        }
    template_text = SETTINGS_TEMPLATE.read_text(encoding="utf-8")
    return json.loads(template_text)
510
+
511
+
512
def _apply_compaction_settings(settings: dict[str, Any]) -> None:
    """
    Merge compaction overrides from the environment into ``settings`` in place.

    Starts from a copy of ``settings["compaction"]`` (or built-in defaults when
    absent or empty). ``PI_COMPACTION_ENABLED``, when set, overrides
    ``enabled``. Non-blank ``PI_COMPACTION_RESERVE_TOKENS`` and
    ``PI_COMPACTION_KEEP_RECENT_TOKENS`` override the token counts and must be
    integers.
    """
    defaults = {
        "enabled": True,
        "reserveTokens": 32768,
        "keepRecentTokens": 20000,
    }
    merged = dict(settings.get("compaction") or defaults)

    if "PI_COMPACTION_ENABLED" in os.environ:
        merged["enabled"] = _env_flag("PI_COMPACTION_ENABLED")

    token_overrides = (
        ("PI_COMPACTION_RESERVE_TOKENS", "reserveTokens"),
        ("PI_COMPACTION_KEEP_RECENT_TOKENS", "keepRecentTokens"),
    )
    for variable, field in token_overrides:
        text = (os.environ.get(variable) or "").strip()
        if text:
            merged[field] = int(text)

    settings["compaction"] = merged
538
+
539
+
540
def resolve_session_dir() -> str:
    """Return the Pi session directory as configured.

    A non-blank ``PI_SESSION_DIR`` wins. Otherwise the HF Space profile uses
    ``/tmp/pi-sessions`` and every other profile uses the relative
    ``sessions``.
    """
    configured = os.environ.get("PI_SESSION_DIR", "").strip()
    if configured:
        return configured
    return "/tmp/pi-sessions" if is_hf_space_profile() else "sessions"
548
+
549
+
550
def ensure_session_dir(session_dir: str | None = None) -> Path:
    """Create the Pi session directory and return its resolved absolute path.

    ``session_dir`` defaults to ``resolve_session_dir()``. A relative value is
    anchored at ``resolve_agent_dir()``.
    """
    requested = Path((session_dir or resolve_session_dir()).strip())
    anchored = requested if requested.is_absolute() else resolve_agent_dir() / requested
    target = anchored.resolve()
    target.mkdir(parents=True, exist_ok=True)
    return target
560
+
561
+
562
def build_settings_config(
    *,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> dict[str, Any]:
    """Build the ``settings.json`` payload.

    Starts from the settings template, then sets the default provider and
    model, compaction overrides, the absolute session directory, retry
    settings (HF Space profile or Gemini provider only) and the workspace
    skills directory. The session directory is created as a side effect.
    """
    chosen_provider = default_provider or get_default_provider()
    if chosen_provider not in PROVIDER_MODELS:
        chosen_provider = PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
    chosen_model = resolved_default_model(chosen_provider, override=default_model)

    config = _load_settings_template()
    config.update({"defaultProvider": chosen_provider, "defaultModel": chosen_model})
    _apply_compaction_settings(config)

    sessions = ensure_session_dir(resolve_session_dir())
    config["sessionDir"] = sessions.as_posix()

    wants_retry = is_hf_space_profile() or chosen_provider == PROVIDER_GEMINI
    if wants_retry:
        _apply_retry_settings(config, provider=chosen_provider)

    # Imported at call time, as in the original layout.
    from pi_workspace_skills import ensure_workspace_skills, workspace_skills_dir

    ensure_workspace_skills()
    config["skills"] = [workspace_skills_dir().as_posix()]
    return config
585
+
586
+
587
def write_runtime_config(
    *,
    agent_dir: Path | None = None,
    default_provider: str | None = None,
    default_model: str | None = None,
) -> tuple[Path, Path]:
    """Write ``models.json`` and ``settings.json`` and return their paths.

    Args:
        agent_dir: Output directory; ``resolve_agent_dir()`` when omitted. It is
            created if missing.
        default_provider: Forwarded to ``build_settings_config``.
        default_model: Forwarded to ``build_settings_config``.

    Returns:
        ``(models_path, settings_path)``. Both files are UTF-8 JSON indented by
        two spaces and end with a newline.
    """
    out_dir = Path(agent_dir or resolve_agent_dir())
    out_dir.mkdir(parents=True, exist_ok=True)

    models_path = out_dir / "models.json"
    settings_path = out_dir / "settings.json"

    # The models file is built and written before the settings are assembled.
    models_json = json.dumps(build_models_config(), indent=2)
    models_path.write_text(models_json + "\n", encoding="utf-8")

    settings_payload = build_settings_config(
        default_provider=default_provider,
        default_model=default_model,
    )
    settings_json = json.dumps(settings_payload, indent=2)
    settings_path.write_text(settings_json + "\n", encoding="utf-8")

    return models_path, settings_path
616
+
617
+
618
def models_for_provider(provider: str) -> list[str]:
    """Return a fresh list of model ids offered for *provider*.

    The Hugging Face Space profile always yields the Gemini models; elsewhere
    an unknown provider falls back to the llama provider's models.
    """
    if is_hf_space_profile():
        selected = PROVIDER_MODELS[PROVIDER_GEMINI]
    else:
        selected = PROVIDER_MODELS.get(provider, PROVIDER_MODELS[PROVIDER_LLAMA])
    return list(selected)
622
+
623
+
624
def default_model_for_provider(provider: str) -> str:
    """Return the default model for *provider* by delegating to ``resolved_default_model``."""
    return resolved_default_model(provider)
626
+
627
+
628
def normalize_provider(provider: str) -> str:
    """Map a provider key or display label to its canonical provider key.

    Matching is case-insensitive and ignores surrounding whitespace. A value
    that is neither a key of ``PROVIDER_MODELS`` nor a label from
    ``PROVIDER_LABELS`` yields the profile default (Gemini on the Hugging Face
    Space profile, llama otherwise).
    """
    key_by_label = dict(
        zip((label.lower() for label in PROVIDER_LABELS.values()), PROVIDER_LABELS)
    )
    wanted = (provider or "").strip().lower()
    if wanted in PROVIDER_MODELS:
        return wanted
    try:
        return key_by_label[wanted]
    except KeyError:
        return PROVIDER_GEMINI if is_hf_space_profile() else PROVIDER_LLAMA
636
+
637
+
638
def apply_session_credentials(
    *,
    gemini_api_key: str | None = None,
    hf_token: str | None = None,
    aws_region: str | None = None,
    aws_access_key_id: str | None = None,
    aws_secret_access_key: str | None = None,
    aws_session_token: str | None = None,
) -> None:
    """Apply session-only credential overrides to ``os.environ``.

    Blank or missing values leave the corresponding variables untouched. The
    Hugging Face token is written to both ``HF_TOKEN`` and
    ``DOC_REDACTION_HF_TOKEN``; the region to both ``AWS_REGION`` and
    ``AWS_DEFAULT_REGION``. The AWS key arguments are always forwarded,
    unmodified, to ``configure_aws_credentials``.
    """

    def _trimmed(value: str | None) -> str:
        return value.strip() if value else ""

    gemini = _trimmed(gemini_api_key)
    if gemini:
        os.environ["GEMINI_API_KEY"] = gemini

    token = _trimmed(hf_token)
    if token:
        for variable in ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"):
            os.environ[variable] = token

    region = _trimmed(aws_region)
    if region:
        for variable in ("AWS_REGION", "AWS_DEFAULT_REGION"):
            os.environ[variable] = region

    configure_aws_credentials(
        session_access_key_id=aws_access_key_id,
        session_secret_access_key=aws_secret_access_key,
        session_session_token=aws_session_token,
    )
662
+
663
+
664
def mirror_hf_token_from_env() -> None:
    """Populate ``HF_TOKEN`` from ``DOC_REDACTION_HF_TOKEN`` when it is unset.

    An existing non-empty ``HF_TOKEN`` is never overwritten, and a blank
    ``DOC_REDACTION_HF_TOKEN`` is ignored.
    """
    if not os.environ.get("HF_TOKEN"):
        fallback = os.environ.get("DOC_REDACTION_HF_TOKEN", "").strip()
        if fallback:
            os.environ["HF_TOKEN"] = fallback
671
+
672
+
673
def _hf_token_status() -> str:
    """Describe whether ``HF_TOKEN`` is set and, if so, where it came from.

    The source is reported as ``UI session`` when ``_HF_TOKEN_FROM_UI`` is set
    in the environment, and as ``env/Space secret`` otherwise.
    """
    if not os.environ.get("HF_TOKEN"):
        return "missing"
    if os.environ.get("_HF_TOKEN_FROM_UI"):
        origin = "UI session"
    else:
        origin = "env/Space secret"
    return f"set ({origin})"
680
+
681
+
682
def credential_status_markdown() -> str:
    """Return a one-line Markdown summary of which credentials are configured.

    The Gemini entry is ``set`` when either ``GEMINI_API_KEY`` or
    ``GOOGLE_API_KEY`` is non-empty. The second entry reports the Hugging Face
    token on the Space profile and AWS credentials plus region otherwise.
    """
    has_gemini = any(
        os.environ.get(name) for name in ("GEMINI_API_KEY", "GOOGLE_API_KEY")
    )
    gemini = "set" if has_gemini else "missing"
    segments = [f"**Credentials:** Gemini `{gemini}`"]
    if not is_hf_space_profile():
        region = _bedrock_region()
        segments.append(f"AWS `{_aws_credential_status()}` · region `{region}`")
    else:
        segments.append(f"HF token (redaction backend) `{_hf_token_status()}`")
    return " · ".join(segments)
695
+
696
+
697
def provider_choices() -> list[str]:
    """Return the selectable provider keys: Gemini only on the Space profile, else every labelled provider."""
    return [PROVIDER_GEMINI] if is_hf_space_profile() else list(PROVIDER_LABELS.keys())
701
+
702
+
703
def gemini_api_key_configured() -> bool:
    """Return True when ``GEMINI_API_KEY`` or ``GOOGLE_API_KEY`` holds a non-empty value."""
    return any(os.environ.get(name) for name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"))
705
+
706
+
707
def provider_label(provider: str) -> str:
    """Return the ``PROVIDER_LABELS`` entry for *provider*, or *provider* itself when it has none."""
    return PROVIDER_LABELS.get(provider, provider)
709
+
710
+
711
if __name__ == "__main__":
    # Script entry point: configure AWS credentials first, then write both
    # runtime config files and print where each one was written.
    configure_aws_credentials()
    models_path, settings_path = write_runtime_config()
    print(f"Wrote {models_path}")
    print(f"Wrote {settings_path}")
agent-redact/pi/pi_examples.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pi agent Gradio examples aligned with the main app SHOW_EXAMPLES redaction demos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ from pi_agent_config import is_hf_space_profile
10
+ from redaction_prompt import HF_DEFAULT_OCR
11
+
12
+
13
def _show_examples_from_env() -> bool:
    """Decide whether examples are shown, defaulting to True.

    ``PI_GRADIO_SHOW_EXAMPLES`` is consulted before ``SHOW_PI_EXAMPLES``. The
    first variable holding a recognised value (``0/false/no`` or
    ``1/true/yes``, case-insensitive, surrounding whitespace ignored) decides;
    unset or unrecognised values fall through to the next variable.
    """
    verdicts = {
        "0": False,
        "false": False,
        "no": False,
        "1": True,
        "true": True,
        "yes": True,
    }
    for variable in ("PI_GRADIO_SHOW_EXAMPLES", "SHOW_PI_EXAMPLES"):
        value = os.environ.get(variable)
        if value is None:
            continue
        verdict = verdicts.get(value.strip().lower())
        if verdict is not None:
            return verdict
    return True


# Evaluated once at import time; later environment changes are not picked up.
SHOW_PI_EXAMPLES = _show_examples_from_env()
28
+
29
+
30
@dataclass(frozen=True)
class PiRedactionExample:
    """One bundled redaction demo: an example file plus the form values to pre-fill."""

    # Display label for the example.
    label: str
    # File name looked up inside the example data directory.
    file_name: str
    # Redaction instructions text for the example.
    instructions: str
    # OCR / text-extraction method choice.
    ocr_method: str
    # PII detection method choice.
    pii_method: str = "Local"
    # Flags emitted as the last two values of each example row.
    encourage_vlm_faces: bool = False
    encourage_vlm_signatures: bool = False
    # Page selection; defaults to the whole document.
    page_range: str = "all"
40
+
41
+
42
def resolve_example_data_dir() -> Path | None:
    """Locate bundled example PDFs (repo checkout, PyPI package, or Docker layout).

    Two roots are probed in order: the path from ``pi_repo_root_path()`` and
    the directory two levels above this file's directory. Under each root
    ``doc_redaction/example_data`` is tried before ``example_data``.

    Returns:
        The first existing directory, resolved, or ``None`` when none exists.
    """
    from bootstrap_pi_config import pi_repo_root_path

    search_roots = (pi_repo_root_path(), Path(__file__).resolve().parents[2])
    for search_root in search_roots:
        for probe in (
            search_root / "doc_redaction" / "example_data",
            search_root / "example_data",
        ):
            if probe.is_dir():
                return probe.resolve()
    return None
59
+
60
+
61
def example_file_path(file_name: str) -> Path | None:
    """Return the resolved path of a usable example file, or ``None``.

    ``None`` is returned when no example directory exists, when *file_name*
    resolves outside that directory, when the target is not a regular file,
    or when the file is a Git LFS pointer rather than real content.
    """
    data_dir = resolve_example_data_dir()
    if data_dir is None:
        return None

    candidate = (data_dir / file_name).resolve()
    try:
        # Rejects names that escape the example directory (e.g. via "..").
        candidate.relative_to(data_dir)
    except ValueError:
        return None

    usable = candidate.is_file() and not _is_lfs_pointer(candidate)
    return candidate if usable else None
75
+
76
+
77
def _is_lfs_pointer(path: Path) -> bool:
    """Return True when *path* is a Git LFS pointer file rather than real content.

    Only the first few bytes are read, so a large example PDF is not loaded
    and decoded just to inspect its first line.

    Args:
        path: File to inspect.

    Returns:
        True if the file starts with the Git LFS pointer signature. False for
        any other content, an empty file, or a path that cannot be read.
    """
    signature = b"version https://git-lfs.github.com/spec/v1"
    try:
        with path.open("rb") as handle:
            head = handle.read(len(signature))
    except OSError:
        # Unreadable or missing paths are treated as "not a pointer".
        return False
    return head == signature
83
+
84
+
85
def _catalog() -> tuple[PiRedactionExample, ...]:
    """Return every known example, whether or not its file is present.

    The OCR method for both entries is ``HF_DEFAULT_OCR`` on the Hugging Face
    Space profile and the local selectable-text option otherwise.
    """
    if is_hf_space_profile():
        text_ocr = HF_DEFAULT_OCR
    else:
        text_ocr = "Local model - selectable text"

    professor_emails = PiRedactionExample(
        label="Emails to a professor",
        file_name="example_of_emails_sent_to_a_professor_before_applying.pdf",
        instructions=(
            "- Any redaction box related to Dr Kornbluth should be removed\n"
            "- References to Dr Hyde, or Dr Hyde's lab should be redacted. Also any references to Lauren, or Lauren Lilley\n"
            "- All mentions of Universities and their names should be redacted\n"
        ),
        ocr_method=text_ocr,
        pii_method="Local",
    )
    cover_letter = PiRedactionExample(
        label="Graduate cover letter",
        file_name="graduate-job-example-cover-letter.pdf",
        instructions=(
            "- Redact any names and titles, apart from Mr Wilson\n"
            "- Redact any organisation names\n"
            "- Redact any place names\n"
        ),
        ocr_method=text_ocr,
        pii_method="Local",
    )
    return (professor_emails, cover_letter)
118
+
119
+
120
def available_pi_examples() -> list[PiRedactionExample]:
    """Return the catalog entries whose example file can be found on disk.

    The result is empty when ``SHOW_PI_EXAMPLES`` is false.
    """
    if not SHOW_PI_EXAMPLES:
        return []
    return [
        entry
        for entry in _catalog()
        if example_file_path(entry.file_name) is not None
    ]
128
+
129
+
130
def example_rows() -> tuple[list[list], list[str]]:
    """Return (gr.Examples rows, labels) for available demos.

    Each row is ``[path, instructions, page_range, ocr_method, pii_method,
    encourage_vlm_faces, encourage_vlm_signatures]``, and ``labels`` holds the
    matching display labels in the same order.
    """
    rows: list[list] = []
    labels: list[str] = []
    for entry in available_pi_examples():
        located = example_file_path(entry.file_name)
        if located is not None:
            row = [
                str(located),
                entry.instructions,
                entry.page_range,
                entry.ocr_method,
                entry.pii_method,
                entry.encourage_vlm_faces,
                entry.encourage_vlm_signatures,
            ]
            rows.append(row)
            labels.append(entry.label)
    return rows, labels
151
+
152
+
153
def gradio_example_allowed_paths() -> list[str]:
    """Return the example data directory as a one-element list, or an empty list when it is missing."""
    data_dir = resolve_example_data_dir()
    return [] if data_dir is None else [str(data_dir)]
158
+
159
+
160
def examples_status_markdown() -> str:
    """Human-readable status for the UI when examples are missing or disabled.

    Covers four cases in order: examples disabled, no example directory found,
    directory found but no usable files, and examples loaded (listing the
    file names).
    """
    if not SHOW_PI_EXAMPLES:
        message = (
            "_Examples are disabled. Set Space variable "
            "`PI_GRADIO_SHOW_EXAMPLES=true` (or `SHOW_PI_EXAMPLES=true`) and restart._"
        )
        return message

    root = resolve_example_data_dir()
    if root is None:
        message = (
            "_Example PDFs not found — expected under "
            "`doc_redaction/example_data/` in the Space image._"
        )
        return message

    loaded = available_pi_examples()
    if loaded:
        names = ", ".join(f"`{ex.file_name}`" for ex in loaded)
        return f"_Examples loaded from `{root}`: {names}_"

    return (
        f"_Example PDFs not found under `{root}`. "
        "Rebuild the Space after syncing example files from the monorepo._"
    )
agent-redact/pi/pi_rpc_client.py ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Python client for Pi RPC mode (JSONL over stdin/stdout)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import shutil
8
+ import subprocess
9
+ import threading
10
+ import uuid
11
+ from collections.abc import Iterator
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+
15
+
16
class PiRpcError(RuntimeError):
    """Raised when the Pi CLI cannot be found, exits, or rejects a command."""
18
+
19
+
20
# Pi RPC is JSONL over pipes; always UTF-8 (Windows default locale is cp1252).
_PI_SUBPROCESS_ENCODING = "utf-8"
# Undecodable bytes are substituted instead of raising while reading the pipes.
_PI_SUBPROCESS_ENCODING_ERRORS = "replace"

# Installation guidance appended to the errors raised when the Pi CLI cannot
# be located.
_PI_INSTALL_HINT = (
    "Install the Pi coding agent CLI, then restart the Gradio app: \n"
    "`npm install -g @earendil-works/pi-coding-agent` \n"
    "On Windows, ensure Node.js/npm are on PATH (or set `PI_EXECUTABLE` to the "
    "full path to `pi.cmd`, e.g. `%APPDATA%\\npm\\pi.cmd`). \n"
    "Docker users: run the Pi UI via `docker compose` (`pi-agent` service) instead "
    "of `python gradio_app.py` on the host."
)
32
+
33
+
34
def resolve_pi_executable() -> str:
    """Return a path to the ``pi`` RPC executable.

    ``PI_EXECUTABLE`` takes precedence when set. A value naming an existing
    file is returned unchanged; a bare command name is resolved through
    ``shutil.which`` and the resolved path is returned, so the launcher gets
    the file that was actually found (for example the ``.cmd`` shim on
    Windows) rather than a name it may not be able to execute. Without an
    override, ``pi`` and then ``pi.cmd`` are looked up on ``PATH``.

    Returns:
        The executable path to launch.

    Raises:
        PiRpcError: If the override does not resolve, or no ``pi`` executable
            is on ``PATH``.
    """
    override = os.environ.get("PI_EXECUTABLE", "").strip()
    if override:
        if os.path.isfile(override):
            return override
        located = shutil.which(override)
        if located:
            return located
        raise PiRpcError(
            f"PI_EXECUTABLE is set but not found: `{override}` \n\n{_PI_INSTALL_HINT}"
        )
    for name in ("pi", "pi.cmd"):
        found = shutil.which(name)
        if found:
            return found
    raise PiRpcError(f"Pi CLI (`pi`) not found on PATH. \n\n{_PI_INSTALL_HINT}")
48
+
49
+
50
@dataclass
class PiStreamEvent:
    """Structured event from Pi RPC for UI layers."""

    # Event category. The client in this file emits: status, turn_end,
    # tool_start, tool_update, tool_end, text_snapshot, thinking_snapshot,
    # text_delta, thinking_delta, error, done.
    kind: str
    # Human-readable text: a status line, a text chunk/snapshot, or an error message.
    text: str = ""
    # Tool identification, filled in for tool_* events.
    tool_name: str | None = None
    tool_call_id: str | None = None
    # Arguments of the tool call (tool_start only).
    tool_args: dict[str, Any] | None = None
    # Text extracted from the tool's partial or final result.
    tool_output: str | None = None
    # True for error events and for tool_end events the RPC flagged as errors.
    is_error: bool = False
    # Extra payload; some status events attach the raw RPC event or selected fields.
    meta: dict[str, Any] = field(default_factory=dict)
62
+
63
+
64
def extract_tool_text(payload: dict[str, Any] | None) -> str:
    """Join the text blocks of a tool payload into one stripped string.

    The content list is taken from ``payload["content"]``; when that is
    missing, from ``payload["partialResult"]["content"]`` and then
    ``payload["result"]["content"]``. Blocks that are not dicts of type
    ``text`` are ignored.

    Returns:
        The text blocks joined by newlines and stripped, or ``""`` when there
        is no payload or no content list.
    """
    if not payload:
        return ""

    content = payload.get("content")
    for wrapper_key in ("partialResult", "result"):
        if content is not None:
            break
        wrapper = payload.get(wrapper_key)
        if isinstance(wrapper, dict):
            content = wrapper.get("content")

    if not isinstance(content, list):
        return ""

    pieces = [
        str(item.get("text") or "")
        for item in content
        if isinstance(item, dict) and item.get("type") == "text"
    ]
    return "\n".join(pieces).strip()
79
+
80
+
81
def extract_assistant_display(message: dict[str, Any] | None) -> tuple[str, str]:
    """Extract visible text and thinking from a partial assistant message.

    Returns:
        ``(visible, thinking)``. Both are empty when *message* is missing, is
        not an assistant message, or has content that is neither a string nor
        a list. String content is returned whole as the visible text. For list
        content, matching blocks are concatenated without a separator.
    """
    if not message or message.get("role") != "assistant":
        return "", ""

    content = message.get("content")
    if isinstance(content, str):
        return content, ""
    if not isinstance(content, list):
        return "", ""

    text_kinds = (None, "text", "output_text")
    thinking_kinds = ("thinking", "reasoning", "thought")
    visible_parts: list[str] = []
    thinking_parts: list[str] = []

    for item in content:
        if isinstance(item, str):
            # Bare strings count as visible text unless they are only whitespace.
            if item.strip():
                visible_parts.append(item)
        elif isinstance(item, dict):
            kind = item.get("type")
            if kind in text_kinds:
                fields, sink = ("text", "content"), visible_parts
            elif kind in thinking_kinds:
                fields = ("thinking", "text", "reasoning", "content")
                sink = thinking_parts
            else:
                continue
            # First truthy field wins, in the order listed above.
            value = next((item[name] for name in fields if item.get(name)), "")
            if value:
                sink.append(str(value))

    return "".join(visible_parts), "".join(thinking_parts)
116
+
117
+
118
def assistant_chat_text(visible: str, thinking: str) -> str:
    """Pick the text for the main chat: the visible answer, or the thinking when the answer is blank."""
    return visible if visible.strip() else thinking
123
+
124
+
125
def _tool_lines_from_content(content: list[Any]) -> list[str]:
    """Render each tool-call block in *content* as a ``**name:** args`` line.

    Only dict blocks whose type is ``toolCall``, ``tool_use`` or
    ``functionCall`` are rendered. String arguments are parsed as JSON (kept
    under ``raw`` when parsing fails) and non-dict arguments become ``{}``.
    """
    rendered: list[str] = []
    for entry in content:
        is_tool_call = isinstance(entry, dict) and entry.get("type") in {
            "toolCall",
            "tool_use",
            "functionCall",
        }
        if not is_tool_call:
            continue

        tool = str(entry.get("name") or entry.get("toolName") or "tool")
        arguments = entry.get("arguments") or entry.get("input") or entry.get("args")
        if isinstance(arguments, str):
            try:
                arguments = json.loads(arguments)
            except json.JSONDecodeError:
                arguments = {"raw": arguments}
        if not isinstance(arguments, dict):
            arguments = {}

        summary = format_tool_args(tool, arguments)
        rendered.append(f"**{tool}:** {summary}")
    return rendered
144
+
145
+
146
def format_assistant_message_for_chat(message: dict[str, Any]) -> str:
    """Render one assistant message for the chat UI.

    Returns the visible text when there is any. Otherwise the message's tool
    calls are rendered one per line, or ``""`` when the content is not a list.
    Thinking text is never included.
    """
    shown, _unused_thinking = extract_assistant_display(message)
    if shown.strip():
        return shown

    blocks = message.get("content")
    if isinstance(blocks, list):
        return "\n".join(_tool_lines_from_content(blocks))
    return ""
157
+
158
+
159
def chat_text_from_assistant_message(message: dict[str, Any] | None) -> str:
    """Return non-thinking chat text for an assistant message snapshot, or ``""`` for anything else."""
    is_assistant = bool(message) and message.get("role") == "assistant"
    return format_assistant_message_for_chat(message) if is_assistant else ""
164
+
165
+
166
# Lower-case fragments that indicate a provider quota or rate-limit failure.
_RATE_LIMIT_MARKERS = (
    "429",
    "quota",
    "rate limit",
    "rate-limit",
    "resource_exhausted",
    "too many requests",
)


def is_rate_limit_error(text: str | None) -> bool:
    """True when *text* looks like a provider quota or rate-limit failure.

    Matching is case-insensitive. Word markers match as substrings. Purely
    numeric markers (the HTTP status ``429``) match only when they are not
    part of a longer digit run, so unrelated numbers such as ``14290`` do not
    count as a rate-limit signal.

    Args:
        text: Error text to inspect; ``None`` and ``""`` are never a match.

    Returns:
        Whether any marker from ``_RATE_LIMIT_MARKERS`` was found.
    """
    import re

    if not text:
        return False
    lowered = text.lower()
    for marker in _RATE_LIMIT_MARKERS:
        if marker.isdigit():
            # Status codes must stand alone as a number.
            if re.search(rf"(?<!\d){re.escape(marker)}(?!\d)", lowered):
                return True
        elif marker in lowered:
            return True
    return False
182
+
183
+
184
def last_assistant_turn_error(messages: list[dict[str, Any]]) -> str | None:
    """Return the latest assistant error in the current user turn, if any.

    The current turn is everything after the last ``user`` message (or the
    whole list when there is none). Assistant messages are scanned from newest
    to oldest; the first one carrying ``errorMessage``, or having
    ``stopReason == "error"``, decides the result.

    Returns:
        The error text, or ``None`` when no assistant message in the turn failed.
    """
    turn_start = 1 + max(
        (
            position
            for position, entry in enumerate(messages)
            if entry.get("role") == "user"
        ),
        default=-1,
    )
    for entry in reversed(messages[turn_start:]):
        if entry.get("role") != "assistant":
            continue
        failure = entry.get("errorMessage")
        if failure:
            return str(failure)
        if entry.get("stopReason") == "error":
            shown, _ = extract_assistant_display(entry)
            return shown if shown.strip() else "assistant turn failed"
    return None
204
+
205
+
206
def assistant_text_since_last_user(messages: list[dict[str, Any]]) -> str:
    """Combine assistant messages from the latest user turn.

    Every assistant message after the last ``user`` message is rendered for
    the chat; blank renderings are dropped and the rest are joined by a blank
    line.
    """
    turn_start = 1 + max(
        (
            position
            for position, entry in enumerate(messages)
            if entry.get("role") == "user"
        ),
        default=-1,
    )
    rendered = (
        format_assistant_message_for_chat(entry)
        for entry in messages[turn_start:]
        if entry.get("role") == "assistant"
    )
    return "\n\n".join(chunk for chunk in rendered if chunk.strip())
222
+
223
+
224
def partial_message_from_update(event: dict[str, Any]) -> dict[str, Any] | None:
    """Return the message snapshot carried by a ``message_update`` event.

    ``assistantMessageEvent.partial`` is preferred over the top-level
    ``message``; a candidate is used only when it is a dict.
    """
    update = event.get("assistantMessageEvent") or {}
    for candidate in (update.get("partial"), event.get("message")):
        if isinstance(candidate, dict):
            return candidate
    return None
233
+
234
+
235
def format_tool_args(tool_name: str | None, args: dict[str, Any] | None) -> str:
    """Summarise tool arguments as a short string for display.

    ``bash`` shows its command in backticks (newlines replaced, capped at 240
    characters); ``read``/``write``/``edit`` show their path in backticks.
    Anything else is compact JSON capped at 278 characters including the
    ellipsis. Missing or empty arguments give ``""``.
    """
    if not args:
        return ""

    tool = (tool_name or "").lower()

    if tool == "bash" and args.get("command"):
        command = str(args["command"]).replace("\n", " ↵ ")
        suffix = "…" if len(command) > 240 else ""
        return f"`{command[:240]}{suffix}`"

    if tool in {"read", "write", "edit"} and args.get("path"):
        return f"`{args['path']}`"

    as_json = json.dumps(args, ensure_ascii=False)
    return as_json if len(as_json) <= 280 else as_json[:277] + "…"
248
+
249
+
250
class PiRpcClient:
    """Drive a long-lived ``pi --mode rpc`` subprocess.

    Commands are written to the child's stdin as one JSON object per line and
    events are read back from its stdout in the same format. Reads and writes
    are guarded by separate locks so that ``abort()`` can be sent from another
    thread while a stream consumer is blocked waiting for the next event. The
    child's stderr is drained in a background thread so a chatty child cannot
    stall on a full pipe.
    """

    # Upper bound on retained stderr text; older output is discarded.
    _STDERR_BUFFER_CHARS = 8192

    def __init__(
        self,
        *,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
        pi_args: list[str] | None = None,
    ) -> None:
        """Store launch settings; the subprocess is started lazily.

        Args:
            cwd: Working directory for the subprocess.
            env: Environment for the subprocess (``None`` inherits the parent's).
            pi_args: Extra arguments appended after ``--mode rpc``.
        """
        self._cwd = cwd
        self._env = env
        self._pi_args = pi_args or []
        self._proc: subprocess.Popen[str] | None = None
        # Serialises writes to the child's stdin.
        self._io_lock = threading.Lock()
        # Serialises reads from the child's stdout; kept separate from the
        # write lock so a blocked reader never delays an abort request.
        self._read_lock = threading.Lock()
        self._stderr_lock = threading.Lock()
        self._stderr_text = ""
        self._stderr_thread: threading.Thread | None = None
        self._abort_requested = False

    @property
    def running(self) -> bool:
        """True while a subprocess exists and has not exited."""
        return self._proc is not None and self._proc.poll() is None

    def start(self) -> None:
        """Launch the Pi subprocess unless one is already running.

        Raises:
            PiRpcError: If the Pi executable cannot be located.
        """
        if self.running:
            return
        command = [resolve_pi_executable(), "--mode", "rpc", *self._pi_args]
        proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding=_PI_SUBPROCESS_ENCODING,
            errors=_PI_SUBPROCESS_ENCODING_ERRORS,
            bufsize=1,
            cwd=self._cwd,
            env=self._env,
        )
        with self._stderr_lock:
            self._stderr_text = ""
        drainer = threading.Thread(
            target=self._drain_stderr,
            args=(proc,),
            name="pi-rpc-stderr",
            daemon=True,
        )
        self._stderr_thread = drainer
        self._proc = proc
        drainer.start()

    def _drain_stderr(self, proc: subprocess.Popen[str]) -> None:
        """Continuously read *proc*'s stderr, keeping only the most recent text."""
        stream = proc.stderr
        if stream is None:
            return
        try:
            for line in stream:
                with self._stderr_lock:
                    # Output from a process that has since been replaced is dropped.
                    if self._proc is proc:
                        combined = self._stderr_text + line
                        self._stderr_text = combined[-self._STDERR_BUFFER_CHARS :]
        except (OSError, ValueError):
            # The stream was closed underneath us; nothing more to collect.
            return

    def _collected_stderr(self, proc: subprocess.Popen[str]) -> str:
        """Return the stderr text gathered for *proc* so far."""
        drainer = self._stderr_thread
        if drainer is None:
            # No drainer exists when the process was not launched by start();
            # read the stream directly in that case.
            if proc.stderr is not None:
                return proc.stderr.read() or ""
            return ""
        if drainer is not threading.current_thread():
            # Give the drainer a moment to pick up the final output.
            drainer.join(timeout=1)
        with self._stderr_lock:
            return self._stderr_text

    def close(self) -> None:
        """Abort any active run, stop the subprocess, and forget it."""
        proc = self._proc
        if not proc:
            return
        if self.running:
            try:
                self.abort()
            except Exception:
                # Best-effort: shutdown continues even if the abort cannot be sent.
                pass
        proc.terminate()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            try:
                # Reap the killed process so it does not linger as a zombie.
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass
        self._proc = None
        self._stderr_thread = None

    def _ensure_running(self) -> subprocess.Popen[str]:
        """Return the live subprocess, starting one first if needed."""
        if not self.running:
            self.start()
        assert self._proc is not None
        return self._proc

    def _read_line(self) -> dict[str, Any]:
        """Read the next non-blank JSONL event from the subprocess.

        Raises:
            PiRpcError: If stdout reaches end-of-file; the message includes the
                exit code and the start of the retained stderr text.
        """
        proc = self._ensure_running()
        assert proc.stdout is not None
        # Blank lines are skipped in a loop; recursion here could exhaust the
        # stack on a long run of empty lines.
        while True:
            with self._read_lock:
                line = proc.stdout.readline()
            if not line:
                code = proc.poll()
                err = self._collected_stderr(proc)
                raise PiRpcError(
                    f"Pi RPC process exited (code={code})."
                    + (f" stderr: {err[:500]}" if err else "")
                )
            line = line.rstrip("\r\n")
            if line:
                return json.loads(line)

    def _write_command(self, command: dict[str, Any]) -> None:
        """Serialise *command* as one JSON line on the subprocess's stdin."""
        proc = self._ensure_running()
        assert proc.stdin is not None
        with self._io_lock:
            proc.stdin.write(json.dumps(command) + "\n")
            proc.stdin.flush()

    def _send_command(
        self,
        command: dict[str, Any],
        *,
        wait_response: bool = True,
    ) -> dict[str, Any] | None:
        """Send *command* and optionally wait for its matching response.

        An ``id`` is added to *command* when it has none. While waiting, events
        other than the matching response are read and discarded.

        Returns:
            The response event, or ``None`` when ``wait_response`` is false.

        Raises:
            PiRpcError: If the response reports failure or the process exits.
        """
        req_id = command.setdefault("id", str(uuid.uuid4()))
        self._write_command(command)
        if not wait_response:
            return None
        while True:
            event = self._read_line()
            if event.get("type") == "response" and event.get("id") == req_id:
                if not event.get("success", False):
                    error = (
                        event.get("error") or event.get("message") or "command failed"
                    )
                    raise PiRpcError(str(error))
                return event

    def abort(self) -> None:
        """Request abort without reading stdout (the active stream consumer drains events)."""
        if not self.running:
            return
        self._abort_requested = True
        try:
            self._send_command({"type": "abort"}, wait_response=False)
        except OSError:
            pass

    @property
    def abort_requested(self) -> bool:
        """True once ``abort()`` has been requested and not yet cleared."""
        return self._abort_requested

    def clear_abort(self) -> None:
        """Reset the abort flag."""
        self._abort_requested = False

    def new_session(self) -> None:
        """Ask Pi to start a new session."""
        self._send_command({"type": "new_session"})

    def get_state(self) -> dict[str, Any]:
        """Return the ``data`` dict of a ``get_state`` response (``{}`` if absent)."""
        response = self._send_command({"type": "get_state"})
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def get_messages(self) -> list[dict[str, Any]]:
        """Return the ``messages`` list of a ``get_messages`` response (``[]`` if absent)."""
        response = self._send_command({"type": "get_messages"})
        data = response.get("data") if response else {}
        messages = data.get("messages") if isinstance(data, dict) else []
        return messages if isinstance(messages, list) else []

    def get_session_stats(self) -> dict[str, Any]:
        """Token usage and cost totals for the active session (Pi RPC ``get_session_stats``)."""
        response = self._send_command({"type": "get_session_stats"})
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def set_model(self, provider: str, model_id: str) -> dict[str, Any]:
        """Select *model_id* from *provider* and return the response ``data`` dict."""
        response = self._send_command(
            {
                "type": "set_model",
                "provider": provider,
                "modelId": model_id,
            }
        )
        data = response.get("data") if response else {}
        return data if isinstance(data, dict) else {}

    def get_available_models(self) -> list[dict[str, Any]]:
        """Return the ``models`` list of a ``get_available_models`` response (``[]`` if absent)."""
        response = self._send_command({"type": "get_available_models"})
        data = response.get("data") if response else {}
        models = data.get("models") if isinstance(data, dict) else []
        return models if isinstance(models, list) else []

    def restart(self) -> None:
        """Stop the current subprocess and launch a fresh one."""
        self.close()
        self.start()

    def prompt_events(self, message: str) -> Iterator[PiStreamEvent]:
        """Send a user message and yield structured events until ``agent_end``."""
        self.clear_abort()
        req_id = str(uuid.uuid4())
        self._send_command(
            {"id": req_id, "type": "prompt", "message": message},
            wait_response=False,
        )

        # Wait for the prompt acknowledgement; a rejection ends the stream
        # with a single error event.
        while True:
            event = self._read_line()
            if event.get("type") == "response" and event.get("id") == req_id:
                if not event.get("success", False):
                    error = (
                        event.get("error") or event.get("message") or "prompt rejected"
                    )
                    yield PiStreamEvent(kind="error", text=str(error), is_error=True)
                    return
                break

        yield from self._iter_agent_events()

    def _iter_agent_events(self) -> Iterator[PiStreamEvent]:
        """Translate raw RPC events into ``PiStreamEvent`` objects until ``agent_end``.

        Event types not handled below are ignored.
        """
        while True:
            event = self._read_line()
            event_type = event.get("type")

            if event_type == "agent_start":
                yield PiStreamEvent(kind="status", text="Agent started…")

            elif event_type == "turn_start":
                yield PiStreamEvent(kind="status", text="Turn started.")

            elif event_type == "turn_end":
                yield PiStreamEvent(kind="turn_end", text="Turn finished.")

            elif event_type == "message_update":
                yield from self._parse_message_update(event)

            elif event_type == "tool_execution_start":
                tool_name = event.get("toolName")
                tool_args = (
                    event.get("args") if isinstance(event.get("args"), dict) else {}
                )
                yield PiStreamEvent(
                    kind="tool_start",
                    tool_name=str(tool_name) if tool_name else "tool",
                    tool_call_id=event.get("toolCallId"),
                    tool_args=tool_args,
                    text=format_tool_args(
                        str(tool_name) if tool_name else None,
                        tool_args,
                    ),
                )

            elif event_type == "tool_execution_update":
                output = extract_tool_text(event)
                yield PiStreamEvent(
                    kind="tool_update",
                    tool_name=event.get("toolName"),
                    tool_call_id=event.get("toolCallId"),
                    tool_output=output,
                )

            elif event_type == "tool_execution_end":
                result = (
                    event.get("result") if isinstance(event.get("result"), dict) else {}
                )
                output = extract_tool_text(result)
                yield PiStreamEvent(
                    kind="tool_end",
                    tool_name=event.get("toolName"),
                    tool_call_id=event.get("toolCallId"),
                    tool_output=output,
                    is_error=bool(event.get("isError")),
                )

            elif event_type == "queue_update":
                steering = event.get("steering") or []
                follow_up = event.get("followUp") or []
                if steering or follow_up:
                    yield PiStreamEvent(
                        kind="status",
                        text="Queue updated.",
                        meta={"steering": steering, "follow_up": follow_up},
                    )

            elif event_type == "compaction_start":
                reason = event.get("reason") or "unknown"
                yield PiStreamEvent(
                    kind="status",
                    text=f"Compaction started ({reason})…",
                    meta={"reason": reason},
                )

            elif event_type == "compaction_end":
                if event.get("aborted"):
                    text = "Compaction aborted."
                elif event.get("errorMessage"):
                    # A failed compaction is reported as an error, not a status.
                    text = f"Compaction failed: {event['errorMessage']}"
                    yield PiStreamEvent(kind="error", text=text, is_error=True)
                    continue
                elif event.get("willRetry"):
                    text = "Compaction complete — retrying prompt…"
                else:
                    tokens = (event.get("result") or {}).get("tokensBefore")
                    text = (
                        f"Compaction complete ({tokens:,} tokens before)."
                        if isinstance(tokens, int)
                        else "Compaction complete."
                    )
                yield PiStreamEvent(kind="status", text=text, meta=event)

            elif event_type == "auto_retry_start":
                attempt = event.get("attempt")
                max_attempts = event.get("maxAttempts")
                delay_ms = event.get("delayMs")
                msg = event.get("errorMessage") or "transient error"
                yield PiStreamEvent(
                    kind="status",
                    text=(
                        f"Auto-retry {attempt}/{max_attempts} in {delay_ms}ms "
                        f"({str(msg)[:120]})"
                    ),
                    meta=event,
                )

            elif event_type == "auto_retry_end":
                if event.get("success"):
                    yield PiStreamEvent(
                        kind="status",
                        text=f"Auto-retry succeeded on attempt {event.get('attempt')}.",
                    )
                else:
                    yield PiStreamEvent(
                        kind="error",
                        text=f"Auto-retry failed: {event.get('finalError', 'unknown error')}",
                        is_error=True,
                    )

            elif event_type == "extension_error":
                yield PiStreamEvent(
                    kind="error",
                    text=str(event.get("error") or "extension error"),
                    is_error=True,
                )

            elif event_type == "agent_end":
                # Capture the flag before clearing it so the final event can
                # report whether this run was aborted.
                aborted = self._abort_requested
                self.clear_abort()
                yield PiStreamEvent(
                    kind="done",
                    text="Agent aborted." if aborted else "Agent finished.",
                )
                return

    def _parse_message_update(self, event: dict[str, Any]) -> Iterator[PiStreamEvent]:
        """Yield snapshot and delta events for one ``message_update`` event.

        Snapshots of the partial message (when one is present) are emitted
        first, followed by the event matching the delta type, if any.
        """
        delta = event.get("assistantMessageEvent") or {}
        delta_type = delta.get("type")
        partial = partial_message_from_update(event)
        if partial is not None:
            visible, thinking = extract_assistant_display(partial)
            if visible.strip():
                yield PiStreamEvent(kind="text_snapshot", text=visible)
            elif chat_text := chat_text_from_assistant_message(partial):
                yield PiStreamEvent(kind="text_snapshot", text=chat_text)
            if thinking.strip():
                yield PiStreamEvent(kind="thinking_snapshot", text=thinking)

        if delta_type == "text_delta":
            chunk = delta.get("delta") or ""
            if chunk:
                yield PiStreamEvent(kind="text_delta", text=chunk)

        elif delta_type == "thinking_delta":
            chunk = delta.get("delta") or ""
            if chunk:
                yield PiStreamEvent(kind="thinking_delta", text=chunk)

        elif delta_type == "toolcall_start":
            tool_call = delta.get("toolCall") or {}
            tool_name = tool_call.get("name") or delta.get("toolName") or "tool"
            tool_args = tool_call.get("arguments")
            if isinstance(tool_args, str):
                try:
                    tool_args = json.loads(tool_args)
                except json.JSONDecodeError:
                    tool_args = {"raw": tool_args}
            if not isinstance(tool_args, dict):
                tool_args = {}
            detail = format_tool_args(str(tool_name), tool_args)
            chat_line = f"**{tool_name}:** {detail}" if detail else f"**{tool_name}**"
            yield PiStreamEvent(kind="text_snapshot", text=chat_line)

        elif delta_type == "error":
            yield PiStreamEvent(
                kind="error",
                text=str(
                    delta.get("message") or delta.get("error") or "generation error"
                ),
                is_error=True,
            )

    def prompt_stream(
        self, message: str, *, show_tool_status: bool = True
    ) -> Iterator[str]:
        """Backward-compatible text stream (assistant visible text + optional tool status)."""
        for event in self.prompt_events(message):
            if event.kind == "text_delta":
                yield event.text
            elif show_tool_status and event.kind == "tool_start":
                yield f"\n\n_[Running {event.tool_name}…]_\n"
            elif event.kind == "error":
                yield f"\n\n**Error:** {event.text}\n"
626
+
627
+
628
def default_client(session_hash: str | None = None) -> PiRpcClient:
    """Build a ``PiRpcClient`` with the standard environment for this app.

    AWS credentials and workspace skills are set up first. The child
    environment is a copy of ``os.environ`` with defaults filled in for
    ``HOME``, ``PYTHONUTF8``, ``PYTHONIOENCODING`` and ``PI_WORKSPACE_DIR``,
    and with ``GEMINI_API_KEY`` / ``HF_TOKEN`` mirrored from
    ``GOOGLE_API_KEY`` / ``DOC_REDACTION_HF_TOKEN`` when only the latter are set.

    Args:
        session_hash: Passed to ``pi_rpc_cwd`` to choose the working directory.
    """
    from pi_agent_config import configure_aws_credentials
    from pi_workspace_skills import ensure_workspace_skills, pi_rpc_args, pi_rpc_cwd

    configure_aws_credentials()
    ensure_workspace_skills()

    child_env = os.environ.copy()
    for variable, fallback in (
        ("HOME", os.path.expanduser("~")),
        ("PYTHONUTF8", "1"),
        ("PYTHONIOENCODING", "utf-8"),
    ):
        child_env.setdefault(variable, fallback)

    from session_workspace import workspace_base_dir

    child_env.setdefault("PI_WORKSPACE_DIR", str(workspace_base_dir()))

    # Fill the primary credential variable from its alternate name when needed.
    for primary, alternate in (
        ("GEMINI_API_KEY", "GOOGLE_API_KEY"),
        ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"),
    ):
        if not child_env.get(primary) and child_env.get(alternate):
            child_env[primary] = child_env[alternate]

    working_dir = pi_rpc_cwd(session_hash)
    return PiRpcClient(cwd=working_dir, env=child_env, pi_args=pi_rpc_args())
agent-redact/pi/pi_session_usage.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Summarize Pi agent LLM token usage for usage-log CSV rows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from pi_rpc_client import PiRpcClient, PiRpcError
11
+
12
+
13
@dataclass(frozen=True)
class TokenUsageTotals:
    """Immutable token counters for a Pi session (see Pi session-format ``Usage``)."""

    input: int = 0
    output: int = 0
    cache_read: int = 0
    cache_write: int = 0

    @property
    def llm_input_tokens(self) -> int:
        """Input-side total for the main-app usage log: input plus both cache counters."""
        return sum((self.input, self.cache_read, self.cache_write))

    @property
    def llm_output_tokens(self) -> int:
        """Output-side total for the usage log (the ``output`` counter unchanged)."""
        return self.output
30
+
31
+
32
+ def _int_field(raw: Any) -> int:
33
+ try:
34
+ return max(0, int(raw or 0))
35
+ except (TypeError, ValueError):
36
+ return 0
37
+
38
+
39
def totals_from_usage_dict(usage: dict[str, Any] | None) -> TokenUsageTotals:
    """Convert a Pi ``usage`` mapping into :class:`TokenUsageTotals`.

    Reads the ``input``, ``output``, ``cacheRead`` and ``cacheWrite`` keys,
    passing each through ``_int_field``. A missing or empty mapping yields
    all-zero totals.
    """
    if not usage:
        return TokenUsageTotals()
    key_for_field = (
        ("input", "input"),
        ("output", "output"),
        ("cache_read", "cacheRead"),
        ("cache_write", "cacheWrite"),
    )
    return TokenUsageTotals(
        **{field: _int_field(usage.get(key)) for field, key in key_for_field}
    )
48
+
49
+
50
def totals_from_stats_payload(data: dict[str, Any] | None) -> TokenUsageTotals:
    """Extract totals from a stats payload's ``tokens`` mapping.

    Returns all-zero totals when *data* is empty or ``tokens`` is not a dict.
    """
    tokens = data.get("tokens") if data else None
    if not isinstance(tokens, dict):
        return TokenUsageTotals()
    return totals_from_usage_dict(tokens)
57
+
58
+
59
def subtract_usage(
    after: TokenUsageTotals, before: TokenUsageTotals
) -> TokenUsageTotals:
    """Return the per-field difference ``after - before``, clamped at zero."""

    def clamped(later: int, earlier: int) -> int:
        return max(0, later - earlier)

    return TokenUsageTotals(
        input=clamped(after.input, before.input),
        output=clamped(after.output, before.output),
        cache_read=clamped(after.cache_read, before.cache_read),
        cache_write=clamped(after.cache_write, before.cache_write),
    )
68
+
69
+
70
def add_usage(left: TokenUsageTotals, right: TokenUsageTotals) -> TokenUsageTotals:
    """Return a new totals object holding the field-wise sum of *left* and *right*."""
    combined_input = left.input + right.input
    combined_output = left.output + right.output
    combined_cache_read = left.cache_read + right.cache_read
    combined_cache_write = left.cache_write + right.cache_write
    return TokenUsageTotals(
        input=combined_input,
        output=combined_output,
        cache_read=combined_cache_read,
        cache_write=combined_cache_write,
    )
77
+
78
+
79
def sum_usage_from_messages(
    messages: list[dict[str, Any]],
    *,
    since_last_user: bool = False,
) -> TokenUsageTotals:
    """Sum ``usage`` on assistant messages (optional: only after the last user turn).

    Args:
        messages: Message dicts; entries that are not dicts are ignored, the
            same way ``sum_usage_from_jsonl`` ignores malformed records.
        since_last_user: When true, only messages after the last entry whose
            ``role`` is ``"user"`` are counted. If there is no user message,
            all messages are counted.

    Returns:
        Accumulated totals; all zeros when nothing matches.
    """
    if not messages:
        return TokenUsageTotals()

    if since_last_user:
        # Scan backwards so we stop at the most recent user turn.
        for index in range(len(messages) - 1, -1, -1):
            candidate = messages[index]
            if isinstance(candidate, dict) and candidate.get("role") == "user":
                messages = messages[index + 1 :]
                break

    total = TokenUsageTotals()
    for message in messages:
        if not isinstance(message, dict) or message.get("role") != "assistant":
            continue
        usage = message.get("usage")
        if isinstance(usage, dict):
            total = add_usage(total, totals_from_usage_dict(usage))
    return total
100
+
101
+
102
def sum_usage_from_jsonl(path: Path) -> TokenUsageTotals:
    """Parse a Pi session JSONL file and sum assistant ``usage`` blocks.

    Best-effort: an unreadable file yields whatever was summed so far (zero
    if it could not be opened). Blank lines, invalid JSON, non-object
    records, records whose ``type`` is not ``"message"`` and non-assistant
    messages are skipped.
    """
    total = TokenUsageTotals()
    try:
        # Iterate the file instead of ``splitlines()``: the latter also splits on
        # separators such as U+2028 that may legally appear inside JSON strings.
        # ``errors="replace"`` keeps a stray invalid byte from aborting the scan.
        with path.open(encoding="utf-8", errors="replace") as handle:
            for line in handle:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    entry = json.loads(stripped)
                except json.JSONDecodeError:
                    continue
                if not isinstance(entry, dict) or entry.get("type") != "message":
                    continue
                message = entry.get("message")
                if not isinstance(message, dict) or message.get("role") != "assistant":
                    continue
                usage = message.get("usage")
                if isinstance(usage, dict):
                    total = add_usage(total, totals_from_usage_dict(usage))
    except OSError:
        return total
    return total
126
+
127
+
128
def resolve_session_token_usage(client: PiRpcClient | None) -> TokenUsageTotals:
    """Return best-effort whole-session token totals for *client*.

    Sources are tried in order and the first with any non-zero counter wins:
    ``client.get_session_stats()``, then ``client.get_messages()``, then the
    session JSONL file located via ``pi_session_file_from_client``. A
    ``PiRpcError`` from either RPC call moves on to the next source. Returns
    zero totals when the client is missing or not running.
    """
    if client is None or not client.running:
        return TokenUsageTotals()

    def has_tokens(candidate: TokenUsageTotals) -> bool:
        return bool(
            candidate.input
            or candidate.output
            or candidate.cache_read
            or candidate.cache_write
        )

    try:
        from_stats = totals_from_stats_payload(client.get_session_stats())
        if has_tokens(from_stats):
            return from_stats
    except PiRpcError:
        pass

    try:
        from_messages = sum_usage_from_messages(client.get_messages())
        if has_tokens(from_messages):
            return from_messages
    except PiRpcError:
        pass

    from session_logs import pi_session_file_from_client

    session_file = pi_session_file_from_client(client)
    if session_file is None:
        return TokenUsageTotals()
    return sum_usage_from_jsonl(session_file)
157
+
158
+
159
def usage_for_completed_turn(
    client: PiRpcClient | None,
    baseline: TokenUsageTotals | None,
) -> TokenUsageTotals:
    """Return the tokens consumed by the prompt that just finished.

    Preference order: the non-zero delta between current session totals and
    *baseline* (captured before ``prompt_events``); then the sum of assistant
    ``usage`` since the last user message; finally the whole-session totals.
    Returns zero totals when the client is missing or not running.
    """
    if client is None or not client.running:
        return TokenUsageTotals()

    def has_tokens(candidate: TokenUsageTotals) -> bool:
        return bool(
            candidate.input
            or candidate.output
            or candidate.cache_read
            or candidate.cache_write
        )

    session_totals = resolve_session_token_usage(client)

    if baseline is not None:
        since_baseline = subtract_usage(session_totals, baseline)
        if has_tokens(since_baseline):
            return since_baseline

    try:
        last_turn = sum_usage_from_messages(
            client.get_messages(), since_last_user=True
        )
        if has_tokens(last_turn):
            return last_turn
    except PiRpcError:
        pass

    return session_totals
agent-redact/pi/pi_workspace_skills.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sync doc_redaction skills into the Pi workspace and constrain Pi RPC to that tree."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import stat
8
+ from pathlib import Path
9
+
10
+ from bootstrap_pi_config import pi_repo_root_path
11
+
12
+
13
def workspace_base_dir() -> Path:
    """Return the workspace base directory by delegating to ``session_workspace``."""
    import session_workspace

    return session_workspace.workspace_base_dir()
17
+
18
+
19
def workspace_pi_dir() -> Path:
    """Return the ``.pi`` directory located directly under the workspace base."""
    base = workspace_base_dir()
    return base / ".pi"
21
+
22
+
23
def workspace_skills_dir() -> Path:
    """Return the ``skills`` directory inside the workspace ``.pi`` directory."""
    pi_dir = workspace_pi_dir()
    return pi_dir / "skills"
25
+
26
+
27
def repo_skills_dir() -> Path:
    """Return the ``skills`` directory under the repo root from ``pi_repo_root_path()``."""
    repo_root = pi_repo_root_path()
    return repo_root / "skills"
29
+
30
+
31
+ def _env_flag(name: str) -> bool:
32
+ return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"}
33
+
34
+
35
def _newest_mtime(root: Path) -> float:
    """Return the most recent modification time of *root* or anything beneath it."""
    newest = root.stat().st_mtime
    for dirpath, dirnames, filenames in os.walk(root):
        for name in (*dirnames, *filenames):
            try:
                newest = max(newest, os.stat(os.path.join(dirpath, name)).st_mtime)
            except OSError:
                # Entry vanished or is unreadable; it cannot be "newer" evidence.
                continue
    return newest


def _should_resync(dest: Path, src: Path) -> bool:
    """Decide whether the workspace skills copy at *dest* must be rebuilt from *src*.

    Returns True when ``PI_SKILLS_RESYNC`` is set, when *dest* is missing or
    empty, when anything in the *src* tree is newer than *dest*, or when the
    filesystem cannot be inspected.

    The whole source tree is examined rather than only the top-level
    directory, because a directory's own mtime does not change when a file
    in a subdirectory is edited.
    """
    if _env_flag("PI_SKILLS_RESYNC"):
        return True
    try:
        if not dest.is_dir() or not any(dest.iterdir()):
            return True
        return _newest_mtime(src) > dest.stat().st_mtime
    except OSError:
        return True
46
+
47
+
48
+ def _copy_tree_item(src: Path, dest: Path) -> None:
49
+ if src.is_dir():
50
+ if dest.exists():
51
+ for child in src.iterdir():
52
+ _copy_tree_item(child, dest / child.name)
53
+ else:
54
+ shutil.copytree(src, dest, copy_function=shutil.copy2)
55
+ return
56
+ dest.parent.mkdir(parents=True, exist_ok=True)
57
+ shutil.copy2(src, dest)
58
+
59
+
60
def _make_readonly(path: Path) -> None:
    """Clear the user/group/other write bits on *path* and everything beneath it.

    Does nothing when ``PI_SKILLS_WRITABLE`` is set. Any ``OSError`` stops the
    operation silently, leaving remaining entries unchanged.
    """
    if _env_flag("PI_SKILLS_WRITABLE"):
        return

    write_bits = stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH

    def strip_write(target: Path) -> None:
        target.chmod(target.stat().st_mode & ~write_bits)

    try:
        if path.is_dir():
            for current, subdirs, filenames in os.walk(path):
                parent = Path(current)
                for filename in filenames:
                    strip_write(parent / filename)
                for subdir in subdirs:
                    strip_write(parent / subdir)
        # The top-level path itself is handled last in both the file and directory case.
        strip_write(path)
    except OSError:
        pass
84
+
85
+
86
def write_workspace_pi_settings() -> Path:
    """
    Write the project Pi settings to ``{workspace}/.pi/settings.json``.

    Paths in that file resolve relative to ``{workspace}/.pi/`` per Pi docs.
    The file is rewritten on every call and its path is returned.
    """
    import json

    target_dir = workspace_pi_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    settings = {
        "skills": ["skills"],
        "extensions": [],
        "packages": [],
        "enableSkillCommands": True,
    }
    serialized = json.dumps(settings, indent=2) + "\n"

    settings_file = target_dir / "settings.json"
    settings_file.write_text(serialized, encoding="utf-8")
    return settings_file
105
+
106
+
107
def _restore_write_access(tree: Path) -> None:
    """Re-add owner write permission across *tree* so it can be deleted."""
    targets = [str(tree)]
    for root, dirs, files in os.walk(tree):
        targets.extend(os.path.join(root, name) for name in (*dirs, *files))
    for target in targets:
        if os.path.islink(target):
            # chmod follows symlinks; never touch whatever a link points at.
            continue
        try:
            os.chmod(target, os.stat(target).st_mode | stat.S_IWUSR)
        except OSError:
            continue


def sync_repo_skills_to_workspace(*, force: bool = False) -> Path:
    """
    Copy ``{repo}/skills/`` → ``{workspace}/.pi/skills/`` (read-only for the agent).

    Re-sync when the repo tree is newer or ``PI_SKILLS_RESYNC=true``.

    Args:
        force: Rebuild the workspace copy even if it looks up to date.

    Returns:
        The workspace skills directory. When the repo has no ``skills``
        directory, an empty destination is created and returned unresolved.

    Side effects:
        Rewrites ``{workspace}/.pi/settings.json``, and after a normal sync
        sets ``PI_WORKSPACE_SKILLS_DIR`` in ``os.environ``.
    """
    src = repo_skills_dir()
    dest = workspace_skills_dir()
    workspace_pi_dir().mkdir(parents=True, exist_ok=True)

    if not src.is_dir():
        dest.mkdir(parents=True, exist_ok=True)
        write_workspace_pi_settings()
        return dest

    if force or _should_resync(dest, src):
        if dest.exists():
            # A previous sync stripped the write bits; without restoring them a
            # non-root user cannot unlink entries, rmtree(ignore_errors=True)
            # fails silently, and the copy below then hits PermissionError.
            _restore_write_access(dest)
            shutil.rmtree(dest, ignore_errors=True)
        dest.mkdir(parents=True, exist_ok=True)
        for item in sorted(src.iterdir()):
            _copy_tree_item(item, dest / item.name)

    _make_readonly(dest)
    write_workspace_pi_settings()
    os.environ["PI_WORKSPACE_SKILLS_DIR"] = str(dest.resolve())
    return dest.resolve()
133
+
134
+
135
def ensure_workspace_skills(*, force: bool = False) -> Path:
    """Idempotent sync used at app startup and before Pi RPC starts.

    Thin wrapper over ``sync_repo_skills_to_workspace``; returns its result.
    """
    skills_path = sync_repo_skills_to_workspace(force=force)
    return skills_path
138
+
139
+
140
def partnership_template_in_workspace() -> Path | None:
    """Return the synced partnership prompt template, or ``None`` if it is not a file."""
    candidate = workspace_skills_dir() / "Example prompt partnership.txt"
    if candidate.is_file():
        return candidate
    return None
143
+
144
+
145
def pi_rpc_cwd(session_hash: str | None = None) -> str:
    """Subprocess cwd for ``pi --mode rpc`` (session subfolder when enabled).

    Falls back to the workspace base directory when *session_hash* is empty
    or blank, or when session workspaces are disabled.
    """
    from session_workspace import session_workspace_dir, session_workspace_enabled

    base = workspace_base_dir()
    has_session = bool(session_hash and session_hash.strip())
    if has_session and session_workspace_enabled():
        return str(session_workspace_dir(session_hash))
    return str(base)
153
+
154
+
155
def pi_rpc_args() -> list[str]:
    """Load only workspace skills; do not discover repo ``skills/`` via ancestors.

    Ensures the workspace skills are synced, then returns the CLI arguments
    ``--no-skills --skill <dir>``.
    """
    synced_dir = ensure_workspace_skills()
    args = ["--no-skills", "--skill"]
    args.append(str(synced_dir))
    return args
159
+
160
+
161
def workspace_boundary_prefix(session_hash: str | None = None) -> str:
    """Extra prompt text: workspace root, skills path, and path rules.

    The "active directory" sentence names the session folder when a
    non-blank *session_hash* is given and session workspaces are enabled;
    otherwise it names the workspace base.
    """
    from session_workspace import session_workspace_dir, session_workspace_enabled

    base = workspace_base_dir().as_posix().rstrip("/")
    skills = workspace_skills_dir().as_posix()

    use_session = bool(session_hash and session_hash.strip()) and session_workspace_enabled()
    if use_session:
        session_root = session_workspace_dir(session_hash).as_posix().rstrip("/")
        scope = f"your session folder `{session_root}/`"
    else:
        scope = f"the workspace `{base}/`"

    sentences = (
        f"**Workspace boundary (mandatory):** work only under `{base}/`. ",
        f"Your active directory is {scope}. ",
        f"Do not read, write, or run shell commands targeting paths outside `{base}/` ",
        f"(including the git checkout and `agent-redact/`). ",
        f"**Skills (read-only):** doc_redaction skills are synced to `{skills}/`. ",
        f"Use `/skill:doc-redaction-app`, `/skill:doc-redact-page-review`, etc. ",
        f"Do not edit files under `{skills}/`.\n\n",
    )
    return "".join(sentences)
agent-redact/pi/redaction_prompt.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build Pi redaction task prompts from the partnership example template."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+
11
+ from pi_agent_config import is_hf_space_profile
12
+ from session_workspace import workspace_base_dir
13
+
14
+
15
def upload_root() -> Path:
    """Gradio upload directory (created by ``bootstrap_pi_config.ensure_pi_upload_root``).

    Uses ``PI_UPLOAD_ROOT`` when it is set to a non-blank value; otherwise asks
    ``ensure_pi_upload_root`` for the location. The directory is created if
    needed and returned as a resolved path.
    """
    configured = (os.environ.get("PI_UPLOAD_ROOT") or "").strip()
    if configured:
        location = Path(configured)
    else:
        from bootstrap_pi_config import ensure_pi_upload_root

        location = Path(ensure_pi_upload_root(pi_repo_root()))
    location.mkdir(parents=True, exist_ok=True)
    return location.resolve()
25
+
26
+
27
+ _SAFE_UPLOAD_FILENAME_MAX_BYTES = 255
28
+ # Path separators, nulls, and characters unsafe on common filesystems — not general punctuation.
29
+ _UNSAFE_UPLOAD_FILENAME_CHARS_RE = re.compile(r'[\x00-\x1f<>:"|?*\\/]')
30
+
31
+
32
+ def _truncate_upload_filename(
33
+ name: str, *, max_bytes: int = _SAFE_UPLOAD_FILENAME_MAX_BYTES
34
+ ) -> str:
35
+ encoded = name.encode("utf-8")
36
+ if len(encoded) <= max_bytes:
37
+ return name
38
+ stem, suffix = os.path.splitext(name)
39
+ suffix_bytes = suffix.encode("utf-8")
40
+ max_stem_bytes = max(1, max_bytes - len(suffix_bytes))
41
+ while stem and len(stem.encode("utf-8")) > max_stem_bytes:
42
+ stem = stem[:-1]
43
+ if not stem:
44
+ stem = "file"
45
+ return stem + suffix
46
+
47
+
48
+ def _split_upload_basename(name: str) -> tuple[str, str]:
49
+ """Split an upload basename into stem and extension (handles ``.pdf`` on Windows)."""
50
+ if re.fullmatch(r"\.[^./\\]+", name):
51
+ return "", name
52
+ path = Path(name)
53
+ return path.stem, path.suffix
54
+
55
+
56
def _workspace_filename_from_upload(name: str) -> tuple[str, str, bool]:
    """
    Derive a workspace-safe basename, changing the name only when required for security.

    Returns ``(original_basename, workspace_basename, renamed)``.

    Raises:
        ValueError: If the basename is empty, ``.``/``..``, or contains a NUL
            byte or a path separator.
    """
    original = Path(name).name.strip()
    forbidden = ("\x00", "/", "\\")
    if not original or original in {".", ".."}:
        raise ValueError("Uploaded file has an invalid name.")
    if any(token in original for token in forbidden):
        raise ValueError("Uploaded file has an invalid name.")

    stem, suffix = _split_upload_basename(original)
    scrub = _UNSAFE_UPLOAD_FILENAME_CHARS_RE.sub
    # Leading/trailing dots and spaces are trimmed from the stem; an empty result becomes "file".
    safe_stem = scrub("_", stem).strip(". ") or "file"
    safe_name = _truncate_upload_filename(safe_stem + scrub("_", suffix))
    return original, safe_name, safe_name != original
76
+
77
+
78
+ _PARTNERSHIP_TEMPLATE = Path("skills") / "Example prompt partnership.txt"
79
+
80
+
81
def _workspace_root() -> Path:
    """Return the workspace base directory from ``workspace_base_dir()``."""
    root = workspace_base_dir()
    return root
83
+
84
+
85
def pi_repo_root() -> Path:
    """Monorepo checkout root (skills/, config/). Set via :func:`bootstrap_pi_config.ensure_pi_workdir`.

    Delegates to ``bootstrap_pi_config.pi_repo_root_path``; the import is
    done at call time.
    """
    import bootstrap_pi_config

    return bootstrap_pi_config.pi_repo_root_path()
90
+
91
+
92
def partnership_template_path() -> Path:
    """Return the prompt template path, preferring the copy synced into the workspace.

    Falls back to the template under the repo root when no synced copy exists.
    """
    from pi_workspace_skills import partnership_template_in_workspace

    workspace_copy = partnership_template_in_workspace()
    if workspace_copy is None:
        return pi_repo_root() / _PARTNERSHIP_TEMPLATE
    return workspace_copy
99
+
100
+
101
+ HF_DEFAULT_OCR = "Local model - selectable text"
102
+ HF_DEFAULT_PII = "Local"
103
+ HF_DEFAULT_GRADIO_URL = "https://seanpedrickcase-document-redaction.hf.space"
104
+
105
+ # Used only when PI_DEFAULT_OCR_METHOD / PI_DEFAULT_PII_METHOD are unset (local-docker profile).
106
+ _FALLBACK_LOCAL_OCR = "hybrid-paddle-inference-server"
107
+ _FALLBACK_LOCAL_PII = "Local"
108
+
109
+
110
def _env_default(key: str, *, hf_default: str, local_fallback: str) -> str:
    """Resolve Pi redaction defaults from env (e.g. config/pi_agent.env) with profile fallbacks.

    A non-blank value of env var *key* wins. Otherwise *hf_default* is used
    under the HF Space profile and *local_fallback* elsewhere.
    """
    from_env = (os.environ.get(key) or "").strip()
    if from_env:
        return from_env
    return hf_default if is_hf_space_profile() else local_fallback
118
+
119
+
120
+ DEFAULT_OCR_METHOD = _env_default(
121
+ "PI_DEFAULT_OCR_METHOD",
122
+ hf_default=HF_DEFAULT_OCR,
123
+ local_fallback=_FALLBACK_LOCAL_OCR,
124
+ )
125
+ DEFAULT_PII_METHOD = _env_default(
126
+ "PI_DEFAULT_PII_METHOD",
127
+ hf_default=HF_DEFAULT_PII,
128
+ local_fallback=_FALLBACK_LOCAL_PII,
129
+ )
130
+
131
+ OCR_METHOD_CHOICES: tuple[str, ...] = (
132
+ "hybrid-paddle-inference-server",
133
+ "hybrid-paddle-vlm",
134
+ "Local model - selectable text",
135
+ "Local OCR",
136
+ "AWS Textract service - all PDF types",
137
+ "tesseract",
138
+ "paddle",
139
+ "hybrid-paddle",
140
+ "vlm",
141
+ "inference-server",
142
+ )
143
+
144
+ PII_METHOD_CHOICES: tuple[str, ...] = (
145
+ "Local",
146
+ "AWS Comprehend",
147
+ "LLM (AWS Bedrock)",
148
+ "Local inference server",
149
+ "Local transformers LLM",
150
+ "Only extract text (no redaction)",
151
+ )
152
+
153
+ _DEFAULT_MAX_PAGES = 3000
154
+
155
+
156
def max_pages_limit() -> int:
    """
    Maximum PDF pages allowed for a Pi redaction task.

    Resolution order: ``PI_MAX_PAGES`` → ``MAX_PAGES`` → ``MAX_DOC_PAGES`` → 3000.
    The first variable with a non-blank value decides the result.

    Raises:
        ValueError: If that variable is not an integer or is less than 1. The
            message names the variable so a misconfiguration is easy to find.
    """
    for key in ("PI_MAX_PAGES", "MAX_PAGES", "MAX_DOC_PAGES"):
        raw = (os.environ.get(key) or "").strip()
        if not raw:
            continue
        try:
            value = int(raw)
        except ValueError as exc:
            raise ValueError(
                f"{key} must be a positive integer (got {raw!r})."
            ) from exc
        if value < 1:
            raise ValueError(f"{key} must be a positive integer.")
        return value
    return _DEFAULT_MAX_PAGES
170
+
171
+
172
def pages_to_process_count(page_range: str, total_pages: int) -> int:
    """Return how many pages ``page_range`` selects from a ``total_pages`` PDF.

    Accepted forms (case-insensitive, surrounding whitespace ignored): empty
    or ``"all"`` for every page, ``"N"`` for a single page, and ``"A-B"`` for
    an inclusive range.

    Raises:
        ValueError: If the document has no pages, the text is not a valid
            page or range, or it refers to pages outside ``1..total_pages``.
    """
    if total_pages < 1:
        raise ValueError("PDF has no pages.")

    spec = (page_range or "all").strip().lower()
    if spec in ("", "all"):
        return total_pages

    first_text, dash, last_text = spec.partition("-")

    if not dash:
        # Single page number.
        try:
            page = int(spec)
        except ValueError as exc:
            raise ValueError(f"Invalid page range: {page_range!r}") from exc
        if not 1 <= page <= total_pages:
            raise ValueError(
                f"Page {page} is out of range (document has {total_pages} pages)."
            )
        return 1

    # Inclusive "start-end" range.
    try:
        first = int(first_text.strip())
        last = int(last_text.strip())
    except ValueError as exc:
        raise ValueError(f"Invalid page range: {page_range!r}") from exc
    if first < 1 or last < first:
        raise ValueError(f"Invalid page range: {page_range!r}")
    if last > total_pages:
        raise ValueError(
            f"Page range {page_range!r} exceeds document length "
            f"({total_pages} pages)."
        )
    return last - first + 1
206
+
207
+
208
def pdf_page_count(file_path: str | Path) -> int:
    """Open *file_path* with PyMuPDF and return its page count as an ``int``."""
    import pymupdf

    with pymupdf.open(Path(file_path)) as document:
        pages = document.page_count
    return int(pages)
214
+
215
+
216
def validate_pdf_page_limit(
    file_path: str | Path,
    *,
    page_range: str = "all",
    max_pages: int | None = None,
) -> None:
    """Reject PDFs whose selected page count exceeds ``max_pages_limit()``.

    Files whose suffix is not ``.pdf`` (case-insensitive) are accepted
    without inspection.

    Args:
        file_path: Document to check.
        page_range: Page selection as understood by ``pages_to_process_count``;
            empty or ``None`` means all pages.
        max_pages: Explicit limit; defaults to ``max_pages_limit()``.

    Raises:
        ValueError: If the page count cannot be read, the range is invalid,
            or the selection exceeds the limit.
    """
    path = Path(file_path)
    if path.suffix.lower() != ".pdf":
        return

    limit = max_pages if max_pages is not None else max_pages_limit()
    try:
        total = pdf_page_count(path)
    except Exception as exc:
        raise ValueError(f"Could not read PDF page count for {path.name}.") from exc

    count = pages_to_process_count(page_range, total)
    if count > limit:
        # pages_to_process_count accepts None/empty; mirror that here so the
        # error message is built instead of failing on ``None.strip()``.
        scope = (page_range or "").strip() or "all"
        raise ValueError(
            f"Number of pages to process ({count}) exceeds the maximum allowed "
            f"({limit}). Submit a smaller document or narrow the page range "
            f"({scope!r})."
        )
241
+
242
+
243
@dataclass(frozen=True)
class RedactionTaskSettings:
    """Per-task OCR/PII method choices and VLM encouragement flags for prompt building."""

    ocr_method: str = DEFAULT_OCR_METHOD
    pii_method: str = DEFAULT_PII_METHOD
    # Evaluated once at class-definition time: on unless running the HF Space profile.
    encourage_vlm_faces: bool = not is_hf_space_profile()
    encourage_vlm_signatures: bool = not is_hf_space_profile()

    @classmethod
    def hf_space_defaults(cls) -> RedactionTaskSettings:
        """Return settings using the HF defaults with both VLM flags disabled."""
        return cls(
            ocr_method=HF_DEFAULT_OCR,
            pii_method=HF_DEFAULT_PII,
            encourage_vlm_faces=False,
            encourage_vlm_signatures=False,
        )

    @classmethod
    def from_ui(
        cls,
        ocr_method: str,
        pii_method: str,
        encourage_vlm_faces: bool,
        encourage_vlm_signatures: bool,
    ) -> RedactionTaskSettings:
        """Build settings from UI values, falling back to defaults for unknown methods.

        Empty method values use the module defaults; values not present in
        ``OCR_METHOD_CHOICES`` / ``PII_METHOD_CHOICES`` are replaced by the
        defaults. The flags are coerced with ``bool()``.
        """
        chosen_ocr = (ocr_method or DEFAULT_OCR_METHOD).strip()
        chosen_pii = (pii_method or DEFAULT_PII_METHOD).strip()
        return cls(
            ocr_method=(
                chosen_ocr if chosen_ocr in OCR_METHOD_CHOICES else DEFAULT_OCR_METHOD
            ),
            pii_method=(
                chosen_pii if chosen_pii in PII_METHOD_CHOICES else DEFAULT_PII_METHOD
            ),
            encourage_vlm_faces=bool(encourage_vlm_faces),
            encourage_vlm_signatures=bool(encourage_vlm_signatures),
        )
279
+
280
+
281
def doc_redaction_gradio_url() -> str:
    """
    Base URL of the doc_redaction Gradio app used for ``/doc_redact`` and review APIs.

    Set ``DOC_REDACTION_GRADIO_URL`` in ``config/pi_agent.env`` (or the process environment).
    Loaded via ``tools.config`` when the Pi app starts (default local: ``http://127.0.0.1:7860``).

    The value is returned with surrounding whitespace and trailing slashes removed.
    """
    from tools import config

    raw_url = str(config.DOC_REDACTION_GRADIO_URL)
    return raw_url.strip().rstrip("/")
291
+
292
+
293
def _default_gradio_url() -> str:
    """Back-compat alias for prompt template substitution.

    Returns whatever ``doc_redaction_gradio_url()`` returns.
    """
    url = doc_redaction_gradio_url()
    return url
296
+
297
+
298
+ def _default_vlm_base_url() -> str:
299
+ return os.environ.get("PI_VLM_BASE_URL", "http://llama-inference:8080")
300
+
301
+
302
+ def _default_vlm_model() -> str:
303
+ return os.environ.get("PI_VLM_MODEL", "unsloth/Qwen3.6-27B-MTP-GGUF")
304
+
305
+
306
def load_template(path: Path | None = None) -> str:
    """Read the prompt template as UTF-8 text.

    Args:
        path: Template file; when omitted (or falsy) the location from
            ``partnership_template_path()`` is used.

    Raises:
        FileNotFoundError: If the chosen path is not an existing file.
    """
    chosen = path or partnership_template_path()
    if chosen.is_file():
        return chosen.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Prompt template not found: {chosen}")
311
+
312
+
313
def format_user_requirements(instructions: str) -> str:
    """Normalise free-text instructions into a newline-joined bullet list.

    Each non-blank line is stripped and prefixed with ``"- "`` unless it
    already starts with ``-``. Blank lines are dropped.
    """
    stripped_lines = (raw.strip() for raw in instructions.strip().splitlines())
    bullets = [
        line if line.startswith("-") else f"- {line}"
        for line in stripped_lines
        if line
    ]
    return "\n".join(bullets)
323
+
324
+
325
def replace_user_requirements_section(template: str, instructions: str) -> str:
    """Replace (or append) the user-requirements section of *template*.

    Everything from the first ``## User redaction requirements`` heading to
    the end of the template is replaced by a fresh heading followed by the
    formatted *instructions*. If the heading is absent, the section is
    appended after the right-stripped template.
    """
    marker = "## User redaction requirements"
    section = (
        f"{marker} (authoritative for this task)\n\n"
        f"{format_user_requirements(instructions)}\n"
    )
    position = template.find(marker)
    if position == -1:
        return f"{template.rstrip()}\n\n{section}"
    return template[:position] + section
333
+
334
+
335
+ def _is_textract_ocr_method(ocr_method: str) -> bool:
336
+ lowered = ocr_method.casefold()
337
+ return "textract" in lowered or lowered in {"textract", "aws textract"}
338
+
339
+
340
def build_vlm_faces_guidance(encourage: bool) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_FACES`` entity.

    Under the HF Space profile the text says face detection is unavailable,
    regardless of *encourage*. Otherwise *encourage* selects between the
    permissive and the restrictive wording.
    """
    if is_hf_space_profile():
        unavailable = (
            "Pass 2 VLM and CUSTOM_VLM_FACES are not available on this deployment. "
            "Do not pass CUSTOM_VLM_FACES or request face detection."
        )
        return unavailable

    permissive = (
        "If the user asks to redact faces, then pass the entity CUSTOM_VLM_FACES "
        "in the initial redaction entity selection"
    )
    restrictive = (
        "Do not pass CUSTOM_VLM_FACES in the initial redaction entity list unless "
        "the user explicitly asks to redact faces"
    )
    return permissive if encourage else restrictive
355
+
356
+
357
def build_vlm_signature_guidance(encourage: bool, ocr_method: str) -> str:
    """Return the prompt sentence about the ``CUSTOM_VLM_SIGNATURE`` entity.

    Under the HF Space profile the text says signature detection is
    unavailable. Otherwise, when *encourage* is false the restrictive wording
    is used; when true, a Textract OCR method adds the
    ``handwrite_signature_textbox`` instruction.
    """
    if is_hf_space_profile():
        return (
            "Pass 2 VLM and CUSTOM_VLM_SIGNATURE are not available on this deployment. "
            "Do not pass CUSTOM_VLM_SIGNATURE or request signature detection."
        )

    if not encourage:
        return (
            "Do not pass CUSTOM_VLM_SIGNATURE in the initial redaction entity list unless "
            "the user explicitly asks to redact signatures"
        )

    if not _is_textract_ocr_method(ocr_method):
        return (
            "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
            "entity in the initial redaction entity selection"
        )

    return (
        "If the user asked to redact signatures, then pass the CUSTOM_VLM_SIGNATURE "
        "entity in the initial redaction entity selection, unless the text extraction "
        "option is AWS Textract, in which case the handwrite_signature_textbox parameter "
        "for the doc_redact endpoint should include 'Extract signatures'"
    )
379
+
380
+
381
def build_remote_backend_guidance(
    *,
    gradio_url: str,
    output_base: str,
    workspace_root: str,
) -> str:
    """Return prompt bullets describing how to use a remote redaction backend.

    Returns an empty string unless the HF Space profile is active.

    Args:
        gradio_url: Base URL of the doc_redaction app.
        output_base: Directory that downloaded outputs should be written to.
        workspace_root: Directory from which local files are uploaded.

    All values are interpolated directly. The text is deliberately not
    passed through ``str.format`` afterwards, so braces inside a URL or path
    are emitted literally instead of raising or being mis-substituted.
    """
    if not is_hf_space_profile():
        return ""

    upload_dir = workspace_root.rstrip("/")
    download_url = gradio_url.rstrip("/")
    output_dir = output_base.rstrip("/") + "/"
    return (
        f"- **Remote redaction backend:** the doc_redaction app runs at `{gradio_url}` "
        "(private Hugging Face Space). Use **`gradio_client` only** — upload local files "
        f"with `handle_file()` from `{upload_dir}/`. "
        "**Do not** call `/agent/*` routes or use server-side paths from the redaction container.\n"
        "- Download all `/doc_redact` and `/review_apply` outputs via "
        f"`{download_url}/gradio_api/file=…` with "
        f"`Authorization: Bearer $HF_TOKEN` into `{output_dir}` (create subdirs as needed).\n"
        "- Run **`verify_redaction_coverage`** locally on downloaded CSV/PDF paths in this "
        "workspace (pandas/PyMuPDF), not via Agent API.\n"
        "- **Pass 2 VLM is not available** — do not call a VLM endpoint or use "
        "`CUSTOM_VLM_FACES` / `CUSTOM_VLM_SIGNATURE` entities.\n"
        "- Helper module: `agent-redact/pi/remote_redaction.py` (`make_redaction_client`, "
        "`download_gradio_files`)."
    )
404
+
405
+
406
+ def _resolve_and_validate_upload_path(upload_path: str | Path) -> Path:
407
+ if not isinstance(upload_path, (str, Path)):
408
+ raise ValueError("Uploaded file path has an invalid type.")
409
+ if not str(upload_path).strip():
410
+ raise ValueError("Uploaded file path is empty.")
411
+
412
+ root = upload_root()
413
+ raw_path = Path(upload_path).expanduser()
414
+ try:
415
+ source = raw_path.resolve(strict=True)
416
+ except FileNotFoundError as exc:
417
+ raise FileNotFoundError(f"Uploaded file not found: {raw_path}") from exc
418
+
419
+ try:
420
+ source.relative_to(root)
421
+ except ValueError as exc:
422
+ raise ValueError(
423
+ f"Uploaded file path resolves outside allowed upload root: {source}"
424
+ ) from exc
425
+ if not source.is_file():
426
+ raise FileNotFoundError(f"Uploaded file not found: {source}")
427
+ if source.is_symlink():
428
+ raise ValueError(f"Symlink uploads are not allowed: {source}")
429
+ return source
430
+
431
+
432
def _resolve_and_validate_workspace_dir(workspace_dir: Path | None) -> Path:
    """Resolve *workspace_dir* (default: the workspace root) and confirm it stays inside the root.

    Raises:
        ValueError: If *workspace_dir* is neither ``None`` nor a ``Path``, or
            if it resolves outside the workspace root.
    """
    if not (workspace_dir is None or isinstance(workspace_dir, Path)):
        raise ValueError("Workspace path has an invalid type.")

    allowed_root = _workspace_root().resolve()
    requested = _workspace_root() if workspace_dir is None else workspace_dir
    resolved = requested.resolve()
    try:
        resolved.relative_to(allowed_root)
    except ValueError as exc:
        raise ValueError(
            f"Workspace path resolves outside allowed workspace root: {resolved}"
        ) from exc
    return resolved
446
+
447
+
448
def copy_upload_to_workspace(
    upload_path: str | Path,
    *,
    workspace_dir: Path | None = None,
) -> tuple[Path, str | None]:
    """
    Copy upload into the session workspace.

    Returns ``(destination_path, original_basename)`` where ``original_basename`` is
    set only when the file was renamed for path safety.

    Raises:
        FileNotFoundError: If the validated upload is not an existing file.
        ValueError: If the upload or workspace path fails validation, or the
            destination would fall outside the workspace.
    """
    source = _resolve_and_validate_upload_path(upload_path)
    if not source.is_file():
        raise FileNotFoundError(f"Uploaded file not found: {source}")

    target_dir = _resolve_and_validate_workspace_dir(workspace_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    original_name, safe_name, was_renamed = _workspace_filename_from_upload(
        source.name
    )
    destination = (target_dir / safe_name).resolve()
    try:
        destination.relative_to(target_dir)
    except ValueError as exc:
        raise ValueError(
            f"Destination path is outside workspace: {destination}"
        ) from exc

    if source != destination:
        # copyfile only: copy2/copystat raises EPERM when overwriting on Docker Desktop bind mounts.
        shutil.copyfile(source, destination)

    renamed_from = original_name if was_renamed else None
    return destination, renamed_from
474
+
475
+
476
def build_redaction_prompt(
    file_name: str,
    user_instructions: str,
    *,
    page_range: str = "all",
    template: str | None = None,
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
) -> str:
    """Fill the prompt template for one redaction task and append the user's requirements.

    Args:
        file_name: Document name; only its basename is used.
        user_instructions: Redaction requirements, one per line.
        page_range: Page selection text; blank or ``None`` becomes ``"all"``.
        template: Template text; loaded via ``load_template()`` when omitted.
        settings: Task settings; defaults to ``RedactionTaskSettings()``.
        workspace_dir: Directory holding the document; defaults to the
            workspace root.

    Raises:
        ValueError: If *file_name* or *user_instructions* is blank.
    """
    if not file_name.strip():
        raise ValueError("A document file name is required.")
    if not user_instructions.strip():
        raise ValueError("Redaction requirements are required (use bullet points).")

    task_settings = settings or RedactionTaskSettings()
    workspace_root = (workspace_dir or _workspace_root()).resolve()
    file_name = Path(file_name).name
    workspace_prefix = workspace_root.as_posix().rstrip("/")
    input_path = f"{workspace_prefix}/{file_name}"
    output_base = f"{workspace_prefix}/redact/{file_name}/"
    gradio_url = _default_gradio_url()

    text = template if template is not None else load_template()
    remote_guidance = build_remote_backend_guidance(
        gradio_url=gradio_url,
        output_base=output_base,
        workspace_root=workspace_root.as_posix(),
    )
    if not remote_guidance:
        # Drop the whole bullet line rather than leaving an empty "- " item.
        text = text.replace("- {REMOTE_BACKEND_GUIDANCE}\n", "")

    replacements = {
        "{FILE_NAME}": file_name,
        "{INPUT_PATH}": input_path,
        "{OUTPUT_BASE}": output_base,
        "{GRADIO_URL}": gradio_url,
        "{PAGE_RANGE}": (page_range or "").strip() or "all",
        "{VLM_BASE_URL}": _default_vlm_base_url(),
        "{VLM_MODEL}": _default_vlm_model(),
        "{DEFAULT_OCR_METHOD}": task_settings.ocr_method,
        "{DEFAULT_PII_METHOD}": task_settings.pii_method,
        "{VLM_FACES_GUIDANCE}": build_vlm_faces_guidance(
            task_settings.encourage_vlm_faces
        ),
        "{VLM_SIGNATURE_GUIDANCE}": build_vlm_signature_guidance(
            task_settings.encourage_vlm_signatures,
            task_settings.ocr_method,
        ),
        # Empty when no remote backend applies, so a placeholder that was not
        # written as its own bullet line is removed instead of leaking through.
        "{REMOTE_BACKEND_GUIDANCE}": remote_guidance,
    }

    # Substitute in a single pass so a placeholder-looking substring inside a
    # value (for example a file literally named "{GRADIO_URL}.pdf") is emitted
    # verbatim rather than expanded by a later replacement.
    placeholder_re = re.compile("|".join(re.escape(key) for key in replacements))
    text = placeholder_re.sub(lambda match: replacements[match.group(0)], text)

    return replace_user_requirements_section(text, user_instructions)
528
+
529
+
530
def prepare_redaction_task(
    upload_path: str | Path | None,
    user_instructions: str,
    *,
    page_range: str = "all",
    settings: RedactionTaskSettings | None = None,
    workspace_dir: Path | None = None,
) -> tuple[str, str, str | None]:
    """
    Copy upload into workspace and return ``(file_name, full_prompt, renamed_from)``.

    ``renamed_from`` is the original upload basename when it was adjusted for path
    safety; otherwise ``None``.

    The upload path is confined to the upload root *before* the file is
    opened to count pages, so a path outside the allowed directory is never
    read.

    Raises:
        ValueError: If no upload was given, a path fails validation, the page
            selection is invalid or over the limit, or the instructions are
            blank.
        FileNotFoundError: If the upload does not exist.
    """
    if upload_path is None:
        raise ValueError("Please upload a document.")
    root = _resolve_and_validate_workspace_dir(workspace_dir)
    source = _resolve_and_validate_upload_path(upload_path)
    validate_pdf_page_limit(source, page_range=page_range)
    dest, renamed_from = copy_upload_to_workspace(source, workspace_dir=root)
    prompt = build_redaction_prompt(
        dest.name,
        user_instructions,
        page_range=page_range,
        settings=settings,
        workspace_dir=root,
    )
    return dest.name, prompt, renamed_from
agent-redact/pi/remote_redaction.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio client helpers for remote doc_redaction HF Space backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+ from urllib.parse import quote
9
+
10
+ import httpx
11
+ from gradio_client import Client
12
+
13
+ DEFAULT_CONNECT_TIMEOUT = 120.0
14
+ DEFAULT_READ_TIMEOUT = 1800.0
15
+
16
+
17
def redaction_base_url() -> str:
    """Return the doc_redaction Gradio URL reported by ``redaction_prompt``."""
    # Imported at call time, as in the original layout, instead of at module import.
    import redaction_prompt

    return redaction_prompt.doc_redaction_gradio_url()
21
+
22
+
23
def redaction_hf_token() -> str | None:
    """
    Return the Hugging Face token from the environment, or ``None``.

    ``HF_TOKEN`` is checked first, then ``DOC_REDACTION_HF_TOKEN``. Each value
    is stripped before it is tested, so a blank or whitespace-only ``HF_TOKEN``
    no longer hides a usable ``DOC_REDACTION_HF_TOKEN``.
    """
    for variable in ("HF_TOKEN", "DOC_REDACTION_HF_TOKEN"):
        candidate = (os.environ.get(variable) or "").strip()
        if candidate:
            return candidate
    return None
26
+
27
+
28
def httpx_timeout(
    *,
    connect: float = DEFAULT_CONNECT_TIMEOUT,
    read: float = DEFAULT_READ_TIMEOUT,
) -> httpx.Timeout:
    """
    Build the ``httpx.Timeout`` used for redaction backend calls.

    ``connect`` is reused for the write and pool limits; ``read`` applies only
    to the read limit.
    """
    limits = {
        "connect": connect,
        "read": read,
        "write": connect,
        "pool": connect,
    }
    return httpx.Timeout(**limits)
34
+
35
+
36
def make_redaction_client(
    base_url: str | None = None,
    hf_token: str | None = None,
) -> Client:
    """
    Return a gradio_client for the remote doc_redaction Space.

    ``base_url`` falls back to ``redaction_base_url()`` and ``hf_token`` to
    ``redaction_hf_token()`` (only when ``hf_token`` is ``None``). Trailing
    slashes are removed from the URL.
    """
    endpoint = (base_url or redaction_base_url()).rstrip("/")
    auth_token = redaction_hf_token() if hf_token is None else hf_token

    client_kwargs: dict[str, Any] = {"httpx_kwargs": {"timeout": httpx_timeout()}}
    if auth_token:
        # NOTE(review): confirm the pinned gradio_client release still accepts
        # the ``hf_token`` keyword.
        client_kwargs["hf_token"] = auth_token
    return Client(endpoint, **client_kwargs)
47
+
48
+
49
+ def _collect_paths(value: Any, out: list[str]) -> None:
50
+ if isinstance(value, str) and value.startswith("/"):
51
+ out.append(value)
52
+ elif isinstance(value, dict):
53
+ path = value.get("path")
54
+ if isinstance(path, str) and path.startswith("/"):
55
+ out.append(path)
56
+ for item in value.values():
57
+ _collect_paths(item, out)
58
+ elif isinstance(value, (list, tuple)):
59
+ for item in value:
60
+ _collect_paths(item, out)
61
+
62
+
63
def extract_server_paths(result: Any) -> list[str]:
    """
    Walk a gradio_client predict result and collect server file paths.

    Duplicates are dropped; the first occurrence of each path keeps its position.
    """
    collected: list[str] = []
    _collect_paths(result, collected)
    # ``dict.fromkeys`` keeps insertion order, giving an order-preserving dedupe.
    return list(dict.fromkeys(collected))
74
+
75
+
76
def download_gradio_files(
    paths: list[str],
    dest_dir: str | Path,
    *,
    base_url: str | None = None,
    hf_token: str | None = None,
) -> list[Path]:
    """
    Download server paths from a Gradio Space into ``dest_dir``.

    Entries that are not strings starting with ``/`` are skipped, as are paths
    without a basename (for example ``"/"``), which would otherwise resolve to
    ``dest_dir`` itself. Each file is streamed to a ``.part`` sibling and moved
    into place once complete, so large outputs are not held in memory and a
    failed transfer leaves no truncated file behind.

    Returns the local paths written, in request order. HTTP and transport
    errors from ``httpx`` propagate to the caller.
    """
    url = (base_url or redaction_base_url()).rstrip("/")
    token = hf_token if hf_token is not None else redaction_hf_token()
    headers: dict[str, str] = {}
    if token:
        headers["Authorization"] = f"Bearer {token.strip()}"

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)
    downloaded: list[Path] = []

    with httpx.Client(timeout=httpx_timeout(), headers=headers) as http:
        for path in paths:
            if not isinstance(path, str) or not path.startswith("/"):
                continue
            file_name = Path(path).name
            if not file_name:
                continue
            file_url = f"{url}/gradio_api/file={quote(path, safe='')}"
            local_path = dest / file_name
            partial_path = local_path.with_name(f"{file_name}.part")
            try:
                with http.stream("GET", file_url) as response:
                    response.raise_for_status()
                    with partial_path.open("wb") as handle:
                        for chunk in response.iter_bytes():
                            handle.write(chunk)
                partial_path.replace(local_path)
            finally:
                # After a successful move this is a no-op; on failure it removes
                # the incomplete download.
                partial_path.unlink(missing_ok=True)
            downloaded.append(local_path)
    return downloaded
agent-redact/pi/session_logs.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resolve Pi agent session JSONL logs for Gradio download and usage-log persistence."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ from pi_agent_config import ensure_session_dir
9
+ from pi_rpc_client import PiRpcClient, PiRpcError
10
+
11
+ from tools.aws_functions import upload_log_file_to_s3
12
+ from tools.config import (
13
+ RUN_AWS_FUNCTIONS,
14
+ S3_USAGE_LOGS_FOLDER,
15
+ SAVE_LOGS_TO_CSV,
16
+ USAGE_LOGS_FOLDER,
17
+ )
18
+
19
+
20
def _session_dir_root() -> Path:
    """Return whatever ``ensure_session_dir()`` reports as the session directory."""
    session_root = ensure_session_dir()
    return session_root
22
+
23
+
24
def pi_session_file_from_client(client: PiRpcClient | None) -> Path | None:
    """
    Return the active Pi session JSONL path from RPC state, if readable.

    ``None`` is returned when the client is missing or not running, when the
    state call raises ``PiRpcError``, when ``sessionFile`` is empty or the
    placeholder ``—``, when the file does not exist, or when it lies outside
    the session directory.
    """
    if client is None or not client.running:
        return None
    try:
        state = client.get_state()
    except PiRpcError:
        return None

    raw = state.get("sessionFile")
    session_file = str(raw).strip() if raw else ""
    if session_file in ("", "—"):
        return None

    # Build the path from the stripped value so stray whitespace around the
    # reported file name does not make an existing file look missing.
    path = Path(session_file).expanduser()
    if not path.is_file():
        return None

    resolved = path.resolve(strict=False)
    # Resolve the root too: comparing a resolved file against an unresolved
    # (relative or symlinked) root would reject files that are really inside it.
    root = Path(_session_dir_root()).resolve(strict=False)
    try:
        resolved.relative_to(root)
    except ValueError:
        return None
    return resolved
44
+
45
+
46
+ def _usage_log_archive_name(source: Path, session_hash: str = "") -> str:
47
+ if session_hash and str(session_hash).strip():
48
+ return f"{str(session_hash).strip()}_{source.name}"
49
+ return source.name
50
+
51
+
52
def copy_session_log_to_usage_folder(
    source: Path,
    *,
    session_hash: str = "",
) -> Path | None:
    """
    Copy a Pi session JSONL into ``USAGE_LOGS_FOLDER`` (beside ``usage_log.csv``).

    Returns the resolved destination path, or ``None`` when
    ``SAVE_LOGS_TO_CSV`` is false or the folder cannot be created or written.
    """
    if not SAVE_LOGS_TO_CSV:
        return None
    usage_dir = Path(USAGE_LOGS_FOLDER)
    dest = usage_dir / _usage_log_archive_name(source, session_hash)
    try:
        # Creating the folder is part of the same best-effort step as the copy,
        # so a failure here returns ``None`` instead of raising.
        usage_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, dest)
    except OSError:
        return None
    return dest.resolve()
68
+
69
+
70
def collect_session_log_download(client: PiRpcClient | None) -> str | None:
    """Return the session log path as a string for ``gr.File``, or ``None`` if there is none."""
    session_file = pi_session_file_from_client(client)
    return None if session_file is None else str(session_file)
76
+
77
+
78
def persist_session_log(
    client: PiRpcClient | None,
    *,
    session_hash: str = "",
) -> Path | None:
    """
    Archive the active Pi session JSONL when local usage logging is enabled.

    Copies into ``USAGE_LOGS_FOLDER`` when ``SAVE_LOGS_TO_CSV`` is true, then
    uploads that copy to ``S3_USAGE_LOGS_FOLDER`` when ``RUN_AWS_FUNCTIONS`` is true.
    Returns the archived path, or ``None`` when nothing was archived.
    """
    if not SAVE_LOGS_TO_CSV:
        return None

    session_file = pi_session_file_from_client(client)
    local_copy = (
        None
        if session_file is None
        else copy_session_log_to_usage_folder(session_file, session_hash=session_hash)
    )
    if local_copy is None:
        return None

    if RUN_AWS_FUNCTIONS:
        upload_log_file_to_s3(str(local_copy), S3_USAGE_LOGS_FOLDER)
    return local_copy
100
+
101
+
102
def export_session_log_to_s3(client: PiRpcClient | None) -> None:
    """Back-compat: persist session log (local archive + optional S3)."""
    # The archived path returned by ``persist_session_log`` is deliberately dropped.
    persist_session_log(client)
    return None
105
+
106
+
107
def gradio_session_log_allowed_paths() -> list[str]:
    """
    Directories Gradio must allow to serve Pi session JSONL files.

    The session directory is listed first; the resolved ``USAGE_LOGS_FOLDER``
    is added when ``SAVE_LOGS_TO_CSV`` is true. A directory whose lookup raises
    ``OSError`` is left out.
    """
    allowed: list[str] = []

    try:
        session_root = str(_session_dir_root())
    except OSError:
        pass
    else:
        allowed.append(session_root)

    if not SAVE_LOGS_TO_CSV:
        return allowed

    try:
        usage_root = str(Path(USAGE_LOGS_FOLDER).resolve())
    except OSError:
        return allowed
    allowed.append(usage_root)
    return allowed
agent-redact/pi/session_workspace.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-session workspace paths for the Pi Gradio UI (mirrors main app session folders)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ import gradio as gr
11
+
12
+ _REPO_ROOT = Path(__file__).resolve().parents[2]
13
+ if str(_REPO_ROOT) not in sys.path:
14
+ sys.path.insert(0, str(_REPO_ROOT))
15
+
16
+ _SESSION_ID_RE = re.compile(r"[^a-zA-Z0-9_@.+-]+")
17
+
18
+
19
def workspace_base_dir() -> Path:
    """
    Shared Pi workspace root (see ``bootstrap_pi_config.ensure_pi_workspace_dir``).

    When ``PI_WORKSPACE_DIR`` is set, that directory is created if needed and
    returned resolved. Otherwise the result of ``ensure_pi_workspace_dir`` is
    returned as a ``Path``.
    """
    configured = (os.environ.get("PI_WORKSPACE_DIR") or "").strip()
    if not configured:
        from bootstrap_pi_config import ensure_pi_workspace_dir

        return Path(ensure_pi_workspace_dir(_REPO_ROOT))

    root = Path(configured)
    root.mkdir(parents=True, exist_ok=True)
    return root.resolve()
30
+
31
+
32
+ def _session_output_folder_enabled() -> bool:
33
+ """Read at call time so ``pi_agent.env`` / dotenv apply before first use."""
34
+ raw = (os.environ.get("SESSION_OUTPUT_FOLDER") or "").strip().lower()
35
+ return raw in {"1", "true", "yes", "on"}
36
+
37
+
38
def session_workspace_enabled() -> bool:
    """
    When true, each Gradio session uses ``{PI_WORKSPACE_DIR}/{session_hash}/``.

    Controlled by ``PI_SESSION_WORKSPACE`` in ``config/pi_agent.env`` (default on when unset).
    Set ``PI_SESSION_WORKSPACE=false`` for a single shared workspace root.

    Only the values ``0``, ``false``, ``no`` and ``off`` (case-insensitive,
    surrounding whitespace ignored) disable the feature; anything else,
    including an unrecognised value, leaves it on.
    """
    raw = os.environ.get("PI_SESSION_WORKSPACE", "").strip().lower()
    # The former ``SESSION_OUTPUT_FOLDER`` lookup was followed by an
    # unconditional ``return True``, so it never affected the result.
    return raw not in {"0", "false", "no", "off"}
53
+
54
+
55
def workspace_base_dir_resolved() -> Path:
    """Return the current workspace root, looked up on every call (never cached at import)."""
    current_root = workspace_base_dir()
    return current_root
58
+
59
+
60
def sanitize_session_id(raw: str) -> str:
    """
    Reduce ``raw`` to a string usable as a single folder name.

    Runs of characters outside ``_SESSION_ID_RE``'s allowed set become ``_``,
    the result is cut to 128 characters, and leading/trailing ``_`` are
    removed. Empty results fall back to ``"default"``, and so do ids made only
    of dots: ``.`` and ``..`` are directory references rather than names, and
    the pattern lets dots through.
    """
    cleaned = _SESSION_ID_RE.sub("_", (raw or "").strip())[:128].strip("_")
    if not cleaned.strip("."):
        return "default"
    return cleaned
63
+
64
+
65
def resolve_session_hash(request: gr.Request | None) -> str:
    """
    Resolve Gradio session id for per-user workspace folders.

    Prefers ``request.session_hash`` (local Pi UI). Falls back to the main app's
    Cognito/OIDC resolver when a deployment header is configured. Returns
    ``"default"`` when there is no request or the resolver raises ``ValueError``.
    """
    if request is None:
        return "default"

    gradio_hash = getattr(request, "session_hash", None)
    has_gradio_hash = gradio_hash is not None and bool(str(gradio_hash).strip())
    if has_gradio_hash:
        return sanitize_session_id(str(gradio_hash))

    # Imported only when the fallback is actually needed.
    from tools.gradio_platform import resolve_session_identity

    try:
        identity = resolve_session_identity(request)
    except ValueError:
        return "default"
    return sanitize_session_id(str(identity))
84
+
85
+
86
def effective_session_hash(
    session_hash: str,
    request: gr.Request | None = None,
) -> str:
    """
    Use ``session_hash_state`` when set; otherwise resolve from the active request.

    Gradio ``demo.load`` may run before ``request.session_hash`` exists, so handlers
    should pass ``request`` and call this on each event.
    """
    stored = (session_hash or "").strip()
    stored_is_real = bool(stored) and stored != "default"
    if stored_is_real:
        return sanitize_session_id(stored)

    if request is not None:
        from_request = resolve_session_hash(request)
        if from_request and from_request != "default":
            return from_request

    # Nothing better than the stored value (or nothing at all) is available.
    return sanitize_session_id(stored) if stored else "default"
106
+
107
+
108
def session_workspace_status_markdown(session_hash: str) -> str:
    """
    Markdown for the workspace panel.

    The workspace folder is created as a side effect (via
    ``ensure_session_workspace``) before the text is built.
    """
    workspace_path = ensure_session_workspace(session_hash).as_posix()
    if not session_workspace_enabled():
        return f"**Workspace:** `{workspace_path}/`"

    session_line = f"**Session id:** `{session_hash}` \n"
    workspace_line = f"**Your workspace:** `{workspace_path}/` \n"
    return session_line + workspace_line
117
+
118
+
119
def prepare_session_workspace(
    session_hash: str,
    request: gr.Request | None = None,
) -> tuple[str, Path, str]:
    """
    Resolve session id, create ``{PI_WORKSPACE_DIR}/{hash}/``, return status text.

    Call at the start of redaction (and on page load) so the folder always exists.
    Returns ``(effective_hash, workspace_path, status_markdown)``.
    """
    resolved_hash = effective_session_hash(session_hash, request)
    workspace_path = ensure_session_workspace(resolved_hash)
    status_text = session_workspace_status_markdown(resolved_hash)
    return resolved_hash, workspace_path, status_text
131
+
132
+
133
def session_s3_outputs_prefix(session_hash: str) -> str:
    """Session-scoped S3 output prefix (shared env vars with main app)."""
    from tools.gradio_platform import build_s3_outputs_prefix

    # Scoping follows the same switch that controls per-session workspaces.
    scoped = session_workspace_enabled()
    return build_s3_outputs_prefix(session_hash, session_scoped=scoped)
141
+
142
+
143
def session_workspace_dir(session_hash: str) -> Path:
    """
    Return the workspace folder for ``session_hash`` without creating it.

    With session workspaces disabled this is the shared base directory.
    Otherwise it is ``base/<sanitised id>``. If that resolves to the base
    itself or to anything outside the base, ``base/default`` is returned
    instead, so a session is never handed the shared root that contains the
    other sessions' folders.
    """
    base = workspace_base_dir().resolve()
    if not session_workspace_enabled():
        return base

    safe_id = sanitize_session_id(session_hash)
    candidate = (base / safe_id).resolve()
    fallback = (base / "default").resolve()

    # ``relative_to`` accepts a path equal to the base (an id such as "."), so
    # that case has to be rejected explicitly.
    if candidate == base:
        return fallback
    try:
        candidate.relative_to(base)
    except ValueError:
        return fallback
    return candidate
154
+
155
+
156
def ensure_session_workspace(session_hash: str) -> Path:
    """Create the session workspace folder (and parents) if missing and return it."""
    session_dir = session_workspace_dir(session_hash)
    session_dir.mkdir(parents=True, exist_ok=True)
    return session_dir
160
+
161
+
162
def init_session_workspace(
    request: gr.Request,
) -> tuple[str, gr.FileExplorer, str, str]:
    """
    App-load handler: create the session subfolder and scope the file explorer.

    Returns ``(session_hash, file_explorer_update, status_markdown, s3_output_prefix)``.
    """
    # No stored hash exists yet at load time, so an empty one is passed in.
    session_hash, workspace, status = prepare_session_workspace("", request)
    s3_prefix = session_s3_outputs_prefix(session_hash)
    explorer_update = gr.FileExplorer(root_dir=workspace.as_posix())
    return session_hash, explorer_update, status, s3_prefix
179
+
180
+
181
def workspace_context_prefix(session_hash: str) -> str:
    """
    Prefix Pi prompts so the agent uses the session workspace.

    Returns an empty string when session workspaces are disabled or
    ``session_hash`` is blank.
    """
    if not session_workspace_enabled():
        return ""
    if not session_hash.strip():
        return ""

    root = session_workspace_dir(session_hash).as_posix().rstrip("/")
    shared_root = workspace_base_dir().as_posix()
    sentences = (
        "**Session workspace (mandatory):** all uploads, downloads, and redaction "
        f"artifacts for this user must live under `{root}/`. ",
        f"Use `{root}/redact/<document>/` for per-document output trees. ",
        f"Do not write to `{root}/output_final_download/` (UI-managed download copies only). ",
        f"Do not read or write other session folders under `{shared_root}/`.\n\n",
    )
    return "".join(sentences)
agent-redact/pi/start.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the Pi Gradio chat UI.
#
# With RUN_FASTAPI=True the shell is replaced by uvicorn serving gradio_app:app.
# Otherwise gradio_app.py is started in the background and the script waits on
# it, which keeps the container alive for `docker compose exec pi-agent pi`.
set -euo pipefail

# Defaults apply only when the variable is unset or empty.
export HOME="${HOME:-/home/node}"
export PI_WORKDIR="${PI_WORKDIR:-/workspace/doc_redaction}"
export PYTHONPATH="${PI_WORKDIR}:${PI_WORKDIR}/agent-redact/pi:${PYTHONPATH:-}"

cd "$PI_WORKDIR"

export APP_TYPE="${APP_TYPE:-pi}"
export APP_CONFIG_PATH="${APP_CONFIG_PATH:-$PI_WORKDIR/config/pi_agent.env}"

workspace_dir="${PI_WORKSPACE_DIR:-/home/user/app/workspace}"
mkdir -p "$workspace_dir"
python3 agent-redact/pi/pi_agent_config.py

if [ "${RUN_FASTAPI:-False}" = "True" ]; then
  listen_host="${GRADIO_SERVER_NAME:-0.0.0.0}"
  listen_port="${PI_GRADIO_PORT:-${GRADIO_SERVER_PORT:-7862}}"
  # exec does not return, so nothing below runs in this mode.
  exec uvicorn gradio_app:app \
    --app-dir agent-redact/pi \
    --host "$listen_host" \
    --port "$listen_port"
fi

python3 agent-redact/pi/gradio_app.py &

wait -n
agent-redact/requirements_pi_agent.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python stack for the pi-agent Docker image (orchestration + Pi Gradio UI).
2
+ #
3
+ # Excludes spaCy, Presidio, and OCR stacks — heavy redaction runs in redaction-app-llama.
4
+ # Includes full Gradio for agent-redact/pi/gradio_app.py (chat frontend over Pi RPC mode).
5
+ #
6
+ # Version caps align with requirements_lightweight.txt where packages overlap.
7
+
8
+ # --- Gradio UI + API client ---
9
+ gradio>=6.9.0,<=6.10.0
10
+ gradio-client>=2.0.0,<=2.4.0
11
+ httpx<=0.28.1
12
+ requests<=2.34.2
13
+ starlette>=0.52.1
14
+
15
+ # --- Config ---
16
+ python-dotenv<=1.2.2
17
+
18
+ # --- CSV / tabular review (skills, page-review merge) ---
19
+ numpy<=2.4.4
20
+ pandas<=2.3.3
21
+ openpyxl<=3.1.5
22
+
23
+ # --- PDF helpers (verify_redaction_coverage, preview scripts) ---
24
+ pymupdf<=1.27.1
25
+
26
+ # --- General utilities ---
27
+ tabulate<=0.10.0
28
+ rapidfuzz<=3.14.5
29
+ defusedxml<=0.7.1
30
+
31
+ # --- Shared platform features (logging, Cognito, S3 via tools/) ---
32
+ boto3<=1.42.61
33
+ bleach<=6.3.0
34
+ fastapi>=0.115.0
35
+ uvicorn>=0.34.0
agent_routes.py ADDED
@@ -0,0 +1,1167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI routes for programmatic / agent callers.
3
+
4
+ HTTP paths align with Gradio ``api_name`` values in app.py. See GET /agent/operations
5
+ for the full map. Uses cli_redact.main(direct_mode_args=...) where a CLI task exists.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from fastapi import APIRouter, Depends, Header, HTTPException
17
+ from fastapi.responses import JSONResponse
18
+ from pydantic import BaseModel, Field, field_validator
19
+
20
+ from tools.config import (
21
+ AWS_LLM_PII_OPTION,
22
+ AWS_PII_OPTION,
23
+ INFERENCE_SERVER_PII_OPTION,
24
+ INPUT_FOLDER,
25
+ LOCAL_OCR_MODEL_OPTIONS,
26
+ LOCAL_PII_OPTION,
27
+ LOCAL_TRANSFORMERS_LLM_PII_OPTION,
28
+ OUTPUT_FOLDER,
29
+ )
30
+ from tools.secure_path_utils import validate_path_safety
31
+
32
+ router = APIRouter(tags=["Agent"])
33
+
34
+ REPO_ROOT = Path(__file__).resolve().parent
35
+ _MAX_INSTRUCTION_LEN = 16_000
36
+
37
+ # NOTE: Paths from request bodies are untrusted. Avoid Path.resolve() on untrusted
38
+ # input (CodeQL py/path-injection); instead normalize via os.path and enforce
39
+ # containment under trusted roots.
40
+
41
+ # Mirrors app.py api_name values (Gradio).
42
+ GRADIO_API_NAMES: tuple[str, ...] = (
43
+ "redact_document",
44
+ "load_and_prepare_documents_or_data",
45
+ "apply_review_redactions",
46
+ "review_apply",
47
+ "pdf_summarise",
48
+ "tabular_redact",
49
+ "word_level_ocr_text_search",
50
+ "redact_data",
51
+ "find_duplicate_pages",
52
+ "find_duplicate_tabular",
53
+ "summarise_document",
54
+ "combine_review_csvs",
55
+ "combine_review_pdfs",
56
+ "export_review_redaction_overlay",
57
+ "export_review_page_ocr_visualisation",
58
+ "verify_redaction_coverage",
59
+ )
60
+
61
+
62
def _allowed_path_roots() -> list[Path]:
    """
    Return the trusted roots: ``REPO_ROOT`` plus any configured input/output folders.

    Roots are returned without resolving. They are trusted config values, but
    avoiding ``Path.resolve()`` keeps CodeQL happy and matches the "no resolve
    on untrusted" approach used elsewhere in this module.
    """
    configured = [
        Path(str(folder)) for folder in (INPUT_FOLDER, OUTPUT_FOLDER) if folder
    ]
    return [REPO_ROOT, *configured]
71
+
72
+
73
def _sanitize_untrusted_path_input(path_str: str) -> str:
    """
    Basic raw-input validation before any path normalization.

    Returns the stripped string. Raises ``HTTPException`` (400) for a
    non-string, an empty/blank value, or a value containing a null byte.
    """
    if not isinstance(path_str, str):
        raise HTTPException(status_code=400, detail="Path must be a string.")

    cleaned = path_str.strip()
    if cleaned == "":
        raise HTTPException(status_code=400, detail="Path must not be empty.")
    if cleaned.find("\x00") != -1:
        raise HTTPException(status_code=400, detail="Path contains invalid null byte.")
    return cleaned
83
+
84
+
85
def _normalize_untrusted_path_to_abs(path_str: str) -> str:
    """
    Expand ~, then normalize to an absolute path.

    Relative paths are interpreted relative to REPO_ROOT (matching prior behaviour).
    """
    expanded = os.path.expanduser(_sanitize_untrusted_path_input(path_str))
    if not os.path.isabs(expanded):
        expanded = os.path.join(str(REPO_ROOT), expanded)
    return os.path.normpath(os.path.abspath(expanded))
96
+
97
+
98
def _must_be_under_allowed_roots(candidate_abs: str, original: str) -> None:
    """
    Enforce candidate is contained under repo, INPUT_FOLDER, or OUTPUT_FOLDER.

    Both sides are compared after ``os.path.realpath``. Raises
    ``HTTPException`` (403) when no root contains the candidate. ``original``
    is accepted but not used by the check.
    """
    candidate_real = os.path.realpath(str(candidate_abs))
    for trusted in _allowed_path_roots():
        root = os.path.realpath(os.path.abspath(str(trusted)))
        try:
            contained = os.path.commonpath([candidate_real, root]) == root
        except ValueError:
            # Different drive on Windows or invalid path mix
            contained = False
        if contained:
            return

    raise HTTPException(
        status_code=403,
        detail="Path must be under the app repo, INPUT_FOLDER, or OUTPUT_FOLDER",
    )
116
+
117
+
118
def _path_must_be_allowed_file(path_str: str) -> str:
    """
    Resolve path, ensure it is under an allowed root and exists as a file.

    Returns the real path. Raises ``HTTPException``: 403 from the containment
    check, 400 when ``validate_path_safety`` rejects the path for every root,
    and 400 when the path is not an existing file.
    """
    candidate_real = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    # Validate both "safe path" patterns and containment under trusted roots.
    _must_be_under_allowed_roots(candidate_real, path_str)
    passes_safety = any(
        validate_path_safety(candidate_real, base_path=str(root))
        for root in _allowed_path_roots()
    )
    if not passes_safety:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    try:
        is_file = Path(candidate_real).is_file()
    except OSError:
        is_file = False
    if not is_file:
        raise HTTPException(
            status_code=400, detail=f"Not a file or missing: {candidate_real}"
        )
    return candidate_real
142
+
143
+
144
def _path_must_be_allowed_directory(path_str: str, *, must_exist: bool = True) -> str:
    """
    Normalize and validate a directory path under allowed roots.

    By default the directory must already exist; callers can opt out (e.g. output_dir
    that will be created later by the CLI). Returns the real path; raises
    ``HTTPException`` (403 or 400) when a check fails.
    """
    candidate_real = os.path.realpath(_normalize_untrusted_path_to_abs(path_str))

    _must_be_under_allowed_roots(candidate_real, path_str)
    passes_safety = any(
        validate_path_safety(candidate_real, base_path=str(root))
        for root in _allowed_path_roots()
    )
    if not passes_safety:
        raise HTTPException(status_code=400, detail=f"Unsafe path rejected: {path_str}")

    if not must_exist:
        return candidate_real

    try:
        is_dir = Path(candidate_real).is_dir()
    except OSError:
        is_dir = False
    if not is_dir:
        raise HTTPException(
            status_code=400, detail=f"Not a directory: {candidate_real}"
        )
    return candidate_real
172
+
173
+
174
def _optional_agent_api_key(x_agent_api_key: Optional[str] = Header(None)) -> None:
    """
    FastAPI dependency enforcing the optional ``AGENT_API_KEY`` shared secret.

    When the ``AGENT_API_KEY`` environment variable is unset or blank the check
    is skipped. Otherwise the ``X-Agent-API-Key`` header (stripped) must equal
    it, or ``HTTPException`` (401) is raised.
    """
    import hmac

    expected = os.environ.get("AGENT_API_KEY", "").strip()
    if not expected:
        return

    provided = (x_agent_api_key or "").strip()
    # Constant-time comparison avoids leaking how much of the key matched.
    # Bytes are compared because ``compare_digest`` rejects non-ASCII ``str``.
    matches = bool(provided) and hmac.compare_digest(
        provided.encode("utf-8"), expected.encode("utf-8")
    )
    if not matches:
        raise HTTPException(
            status_code=401,
            detail="Set header X-Agent-API-Key to match AGENT_API_KEY environment variable",
        )
183
+
184
+
185
class AgentRedactDocumentRequest(BaseModel):
    """Parity with Gradio api_name ``redact_document``."""

    # Required; at least one path must be supplied.
    input_files: list[str] = Field(
        ...,
        min_length=1,
        description="Paths to input files (PDF, images, or tabular/Word for anonymisation)",
    )
    # Length is capped by ``_cap_instruction`` below.
    instruction: Optional[str] = Field(
        None,
        description="Optional instructions for LLM-based PII detection (custom_llm_instructions)",
    )
    output_dir: Optional[str] = None
    input_dir: Optional[str] = None
    ocr_method: Optional[str] = Field(
        None,
        description=(
            "High-level OCR/text mode. Accepted values: 'Local OCR', "
            "'AWS Textract', 'Local text'. To choose a specific local OCR engine "
            "(e.g. paddle/tesseract/vlm), set "
            "overrides.chosen_local_ocr_model."
        ),
    )
    pii_detector: Optional[str] = Field(
        None,
        description=(
            "PII detection method. Recommended configured labels: "
            f"'{LOCAL_PII_OPTION}', '{AWS_PII_OPTION}', '{AWS_LLM_PII_OPTION}', "
            f"'{INFERENCE_SERVER_PII_OPTION}', '{LOCAL_TRANSFORMERS_LLM_PII_OPTION}', "
            "'None'."
        ),
    )
    overrides: Optional[dict[str, Any]] = Field(
        None,
        description=(
            "Optional CLI flag overrides; keys must match argparse destination names. "
            "For local OCR model selection, set 'chosen_local_ocr_model' "
            f"(allowed models depend on deployment; configured options: {LOCAL_OCR_MODEL_OPTIONS})."
        ),
    )

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "input_files": [
                        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
                    ],
                    "instruction": "Do not redact the university name.",
                    "ocr_method": "Local OCR",
                    "pii_detector": LOCAL_PII_OPTION,
                    "overrides": {"chosen_local_ocr_model": "paddle"},
                }
            ]
        }
    }

    @field_validator("instruction")
    @classmethod
    def _cap_instruction(cls, v: Optional[str]) -> Optional[str]:
        # ``None`` and values within the limit pass through unchanged.
        if v is not None and len(v) > _MAX_INSTRUCTION_LEN:
            raise ValueError(f"instruction exceeds {_MAX_INSTRUCTION_LEN} characters")
        return v
250
+
251
+
252
class AgentRedactDataRequest(AgentRedactDocumentRequest):
    """Parity with Gradio api_name ``redact_data``; same CLI task as redact_document."""

    # No extra fields: the request shape is inherited unchanged.
254
+
255
+
256
class AgentTaskResponse(BaseModel):
    # Response body for endpoints that run a CLI task.
    # (Kept as comments: a class docstring would alter the generated schema.)
    status: str
    gradio_api_name: str
    task: str
    output_dir: str
    input_dir: str
    message: str
    # Optional extras; both default to ``None``.
    log_excerpt: Optional[str] = None
    output_paths: Optional[list[str]] = None
265
+
266
+
267
class AgentVerifyRedactionRequest(BaseModel):
    # Request body for redaction-coverage verification.
    # (Kept as comments: a class docstring would alter the generated schema.)

    # --- Required inputs ---
    review_csv_path: str = Field(..., description="Path to *_review_file.csv")
    ocr_words_csv_path: str = Field(
        ..., description="Path to *_ocr_results_with_words_*.csv from the same run"
    )

    # --- Optional pattern lists ---
    must_redact: Optional[List[str]] = Field(
        None,
        description="Regex patterns for terms that must be covered by review boxes.",
    )
    must_not_redact: Optional[List[str]] = Field(
        None,
        description="Regex patterns for terms that must not appear in review rows.",
    )

    # --- Optional redacted-PDF checks ---
    redacted_pdf_path: Optional[str] = Field(
        None, description="Optional applied *_redacted.pdf for text-layer leak checks."
    )
    total_pages: Optional[int] = Field(None, ge=1)
    min_word_length: int = Field(3, ge=1, le=32)
    sample_pixels: bool = Field(
        False,
        description="Sample pixel darkness at box centres on redacted PDF (requires redacted_pdf_path).",
    )

    # --- Optional pruning ---
    auto_prune_suspicious: bool = Field(
        False,
        description="Remove prunable suspicious short/OCR-fragment rows and write pruned CSV.",
    )
    pruned_output_path: Optional[str] = Field(
        None,
        description="Output path for pruned CSV when auto_prune_suspicious is true.",
    )
297
+
298
+
299
class AgentVerifyRedactionResponse(BaseModel):
    # Response body for redaction-coverage verification.
    # (Kept as comments: a class docstring would alter the generated schema.)
    status: str
    gradio_api_name: str = "verify_redaction_coverage"
    # Three separate pass flags are reported alongside the full report.
    coverage_pass: bool
    coverage_pass_strict: bool
    coverage_pass_with_cleanup: bool
    pruned_csv_path: Optional[str] = None
    prune_log: Optional[Dict[str, Any]] = None
    report: Dict[str, Any]
308
+
309
+
310
class AgentWordLevelOcrSearchRequest(BaseModel):
    # Request body for searching word-level OCR output.
    # (Kept as comments: a class docstring would alter the generated schema.)
    ocr_words_csv_path: str = Field(
        ..., description="Path to *_ocr_results_with_words_*.csv"
    )
    # Search text must be 3-500 characters long.
    search_text: str = Field(..., min_length=3, max_length=500)
    # Threshold is bounded to [0.0, 1.0] and defaults to 1.0.
    similarity_threshold: float = Field(1.0, ge=0.0, le=1.0)
    use_regex: bool = False
    review_csv_path: Optional[str] = Field(
        None,
        description="Optional *_review_file.csv to flag whether each hit is covered by a box.",
    )
321
+
322
+
323
class AgentWordLevelOcrSearchResponse(BaseModel):
    # Response body for the word-level OCR search.
    # (Kept as comments: a class docstring would alter the generated schema.)
    status: str
    gradio_api_name: str = "word_level_ocr_text_search"
    result: Dict[str, Any]
327
+
328
+
329
def _merge_redact_direct_mode(body: AgentRedactDocumentRequest) -> dict[str, Any]:
    """
    Build the ``direct_mode_args`` dict for a redact run from a request body.

    Starts from the CLI defaults, forces ``task="redact"``, validates every
    input file and directory against the allowed roots, then applies
    ``body.overrides``.

    Overrides can no longer sidestep those checks: ``task`` and ``input_file``
    are refused, and ``output_dir`` / ``input_dir`` are validated exactly like
    the dedicated request fields.

    Raises ``HTTPException`` (400/403) for an unknown or refused override key or
    a path that fails validation.
    """
    from cli_redact import get_cli_default_args_dict

    merged: dict[str, Any] = get_cli_default_args_dict()
    merged["task"] = "redact"
    merged["input_file"] = [_path_must_be_allowed_file(p) for p in body.input_files]

    if body.instruction is not None:
        merged["custom_llm_instructions"] = body.instruction
    if body.output_dir is not None:
        # Output folders may not exist yet (CLI will create). Still constrain to allowed roots.
        merged["output_dir"] = _path_must_be_allowed_directory(
            body.output_dir, must_exist=False
        )
    if body.input_dir is not None:
        # Input dir should exist if provided.
        merged["input_dir"] = _path_must_be_allowed_directory(
            body.input_dir, must_exist=True
        )
    if body.ocr_method is not None:
        merged["ocr_method"] = body.ocr_method
    if body.pii_detector is not None:
        merged["pii_detector"] = body.pii_detector

    if body.overrides:
        allowed = set(merged.keys())
        for key, value in body.overrides.items():
            if key not in allowed:
                raise HTTPException(
                    status_code=400,
                    detail=f"Unknown override key '{key}'. Must be a known CLI argument name.",
                )
            if key in ("task", "input_file"):
                # These are fixed or validated above; an override would replace
                # the checked values with unchecked ones.
                raise HTTPException(
                    status_code=400,
                    detail=f"Override key '{key}' is not permitted; use the request fields instead.",
                )
            if key in ("output_dir", "input_dir") and value is not None:
                value = _path_must_be_allowed_directory(
                    value, must_exist=(key == "input_dir")
                )
            # NOTE(review): other path-valued CLI arguments, if any exist, are
            # still accepted here unvalidated - confirm against cli_redact.
            merged[key] = value

    return merged
364
+
365
+
366
def _run_cli_main(direct: dict[str, Any], gradio_api_name: str) -> AgentTaskResponse:
    """Run ``cli_redact.main`` in direct mode and wrap the outcome for the API.

    Console output printed while the CLI runs is captured and the last 8000
    characters are returned as ``log_excerpt``.

    Args:
        direct: Fully merged CLI argument dict, passed as ``direct_mode_args``.
        gradio_api_name: Operation name echoed back in the response.

    Returns:
        An ``AgentTaskResponse`` with ``status="completed"``.

    Raises:
        HTTPException: 500 carrying ``str(exc)`` when the CLI raises.
    """
    from contextlib import redirect_stdout

    from cli_redact import main as cli_main

    captured = io.StringIO()
    # NOTE(review): redirect_stdout replaces the process-wide sys.stdout, so
    # overlapping requests would capture each other's output. Confirm that
    # these routes are not executed concurrently, or serialise them.
    try:
        with redirect_stdout(captured):
            cli_main(direct_mode_args=direct)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Keep only the tail of the console output; slicing is a no-op when the
    # capture is already shorter than the limit.
    log_excerpt = captured.getvalue()[-8000:]

    return AgentTaskResponse(
        status="completed",
        gradio_api_name=gradio_api_name,
        task=str(direct.get("task", "")),
        output_dir=str(direct.get("output_dir", "")),
        input_dir=str(direct.get("input_dir", "")),
        message="cli_redact.main finished; see log_excerpt for console output",
        log_excerpt=log_excerpt or None,
    )
392
+
393
+
394
@router.post(
    "/redact_document",
    response_model=AgentTaskResponse,
    summary="redact_document (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_document'``. "
        "``python cli_redact.py --task redact --input_file ...``. "
        "Optional ``instruction`` maps to ``custom_llm_instructions``. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_document(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Merge the request into CLI arguments and run the ``redact`` task."""
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
414
+
415
+
416
@router.post(
    "/redact_data",
    response_model=AgentTaskResponse,
    summary="redact_data (Gradio api_name)",
    description=(
        "Matches Gradio ``api_name='redact_data'``. Same CLI ``redact`` task as "
        "/redact_document; use CSV/XLSX/DOCX paths for tabular/Word flows. "
        "OCR modes: 'Local OCR' | 'AWS Textract' | 'Local text'. "
        "Specific local OCR engines are set via ``overrides.chosen_local_ocr_model`` "
        f"(for example: {LOCAL_OCR_MODEL_OPTIONS}). "
        "PII methods should use configured labels shown on the request schema."
    ),
)
def post_redact_data(
    body: AgentRedactDataRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Run the same CLI ``redact`` flow as /redact_document, reported as ``redact_data``."""
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_data")
435
+
436
+
437
@router.post(
    "/tasks/redact",
    response_model=AgentTaskResponse,
    summary="Legacy: same as /redact_document",
    description="Deprecated alias; prefer POST /agent/redact_document.",
    deprecated=True,
    include_in_schema=True,
)
def post_tasks_redact_legacy(
    body: AgentRedactDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Deprecated alias that behaves exactly like the /redact_document handler."""
    return _run_cli_main(_merge_redact_direct_mode(body), "redact_document")
451
+
452
+
453
class AgentFindDuplicatePagesRequest(BaseModel):
    # Request body for the find_duplicate_pages route.

    # At least one input path is required.
    input_files: list[str] = Field(default=..., min_length=1)
    # Every optional field left as None keeps the CLI default for that argument.
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    min_consecutive_pages: Optional[int] = None
    greedy_match: Optional[bool] = None
    combine_pages: Optional[bool] = None
    # Extra CLI arguments; the handler rejects keys it does not recognise.
    overrides: Optional[dict[str, Any]] = None
461
+
462
+
463
@router.post(
    "/find_duplicate_pages",
    response_model=AgentTaskResponse,
    summary="find_duplicate_pages (Gradio api_name)",
    description="``cli_redact --task deduplicate --duplicate_type pages``.",
)
def post_find_duplicate_pages(
    body: AgentFindDuplicatePagesRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Run the CLI ``deduplicate`` task in ``pages`` mode.

    Raises:
        HTTPException: 400 when ``body.overrides`` contains an unknown key.
    """
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "deduplicate"
    cli_args["duplicate_type"] = "pages"
    cli_args["input_file"] = [
        _path_must_be_allowed_file(candidate) for candidate in body.input_files
    ]

    # Numeric tuning knobs are copied verbatim when supplied.
    for numeric in ("similarity_threshold", "min_word_count", "min_consecutive_pages"):
        supplied = getattr(body, numeric)
        if supplied is not None:
            cli_args[numeric] = supplied

    # Boolean switches are handed to the CLI as the strings "True" / "False".
    for switch in ("greedy_match", "combine_pages"):
        supplied = getattr(body, switch)
        if supplied is not None:
            cli_args[switch] = "True" if supplied else "False"

    if body.overrides:
        known_keys = set(cli_args)
        for name, value in body.overrides.items():
            if name not in known_keys:
                raise HTTPException(400, f"Unknown override key: {name}")
            cli_args[name] = value

    return _run_cli_main(cli_args, "find_duplicate_pages")
496
+
497
+
498
class AgentFindDuplicateTabularRequest(BaseModel):
    # Request body for the find_duplicate_tabular route.

    # At least one input path is required.
    input_files: list[str] = Field(default=..., min_length=1)
    # Optional fields left as None keep the CLI default for that argument.
    text_columns: Optional[list[str]] = None
    similarity_threshold: Optional[float] = None
    min_word_count: Optional[int] = None
    # Extra CLI arguments; the handler rejects keys it does not recognise.
    overrides: Optional[dict[str, Any]] = None
504
+
505
+
506
@router.post(
    "/find_duplicate_tabular",
    response_model=AgentTaskResponse,
    summary="find_duplicate_tabular (Gradio api_name)",
)
def post_find_duplicate_tabular(
    body: AgentFindDuplicateTabularRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "deduplicate" task in "tabular" mode. Raises HTTP 400 when
    # body.overrides names a key missing from the merged CLI arguments.
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "deduplicate"
    cli_args["duplicate_type"] = "tabular"
    cli_args["input_file"] = [
        _path_must_be_allowed_file(candidate) for candidate in body.input_files
    ]

    # Copy each optional setting only when the caller supplied it.
    for option in ("text_columns", "similarity_threshold", "min_word_count"):
        supplied = getattr(body, option)
        if supplied is not None:
            cli_args[option] = supplied

    if body.overrides:
        known_keys = set(cli_args)
        for name, value in body.overrides.items():
            if name not in known_keys:
                raise HTTPException(400, f"Unknown override key: {name}")
            cli_args[name] = value

    return _run_cli_main(cli_args, "find_duplicate_tabular")
534
+
535
+
536
class AgentSummariseDocumentRequest(BaseModel):
    # Request body for the summarise_document route.

    # At least one input path is required.
    input_files: list[str] = Field(default=..., min_length=1)
    # Optional fields left as None keep the CLI default for that argument.
    summarisation_inference_method: Optional[str] = None
    summarisation_format: Optional[str] = None
    summarisation_context: Optional[str] = None
    summarisation_additional_instructions: Optional[str] = None
    # Extra CLI arguments; the handler rejects keys it does not recognise.
    overrides: Optional[dict[str, Any]] = None
543
+
544
+
545
@router.post(
    "/summarise_document",
    response_model=AgentTaskResponse,
    summary="summarise_document (Gradio api_name)",
)
def post_summarise_document(
    body: AgentSummariseDocumentRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "summarise" task. Raises HTTP 400 when body.overrides names
    # a key missing from the merged CLI arguments.
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "summarise"
    cli_args["input_file"] = [
        _path_must_be_allowed_file(candidate) for candidate in body.input_files
    ]

    # Request fields share their names with the CLI arguments they set.
    summary_options = (
        "summarisation_inference_method",
        "summarisation_format",
        "summarisation_context",
        "summarisation_additional_instructions",
    )
    for option in summary_options:
        supplied = getattr(body, option)
        if supplied is not None:
            cli_args[option] = supplied

    if body.overrides:
        known_keys = set(cli_args)
        for name, value in body.overrides.items():
            if name not in known_keys:
                raise HTTPException(400, f"Unknown override key: {name}")
            cli_args[name] = value

    return _run_cli_main(cli_args, "summarise_document")
576
+
577
+
578
class AgentCombineReviewPdfsRequest(BaseModel):
    # Request body for the combine_review_pdfs route.

    # Combining needs at least two input paths.
    input_files: list[str] = Field(default=..., min_length=2)
    # None keeps the CLI default output directory.
    output_dir: Optional[str] = None
581
+
582
+
583
@router.post(
    "/combine_review_pdfs",
    response_model=AgentTaskResponse,
    summary="combine_review_pdfs (Gradio api_name)",
)
def post_combine_review_pdfs(
    body: AgentCombineReviewPdfsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    # Runs the CLI "combine_review_pdfs" task over the validated input files.
    from cli_redact import get_cli_default_args_dict

    cli_args = get_cli_default_args_dict()
    cli_args["task"] = "combine_review_pdfs"
    cli_args["input_file"] = [
        _path_must_be_allowed_file(candidate) for candidate in body.input_files
    ]

    requested_out = body.output_dir
    if requested_out is not None:
        # NOTE(review): the redact routes pass must_exist=False for output
        # directories; here the validator's default applies - confirm that the
        # default matches the intent for not-yet-created folders.
        cli_args["output_dir"] = _path_must_be_allowed_directory(requested_out)

    return _run_cli_main(cli_args, "combine_review_pdfs")
600
+
601
+
602
+ class _NamedPath:
603
+ """merge_csv_files expects objects with a .name attribute (Gradio file-like)."""
604
+
605
+ __slots__ = ("name",)
606
+
607
+ def __init__(self, path: str) -> None:
608
+ self.name = path
609
+
610
+
611
class AgentCombineReviewCsvsRequest(BaseModel):
    # Request body for the combine_review_csvs route.

    # At least one CSV path is required.
    input_files: list[str] = Field(default=..., min_length=1)
    # When omitted the handler falls back to OUTPUT_FOLDER.
    output_dir: Optional[str] = Field(
        default=None, description="Defaults to config OUTPUT_FOLDER"
    )
616
+
617
+
618
class AgentApplyReviewRedactionsRequest(BaseModel):
    """Headless parity with Gradio ``api_name='apply_review_redactions'`` (prepare + apply)."""

    # --- required inputs -------------------------------------------------
    pdf_path: str = Field(
        default=...,
        description="Path to the source PDF under allowed roots.",
    )
    review_csv_path: str = Field(
        default=...,
        description=(
            "Path to the review plan CSV; basename must contain '_review_file' "
            "(e.g. mydoc_review_file.csv)."
        ),
    )

    # --- optional locations (validated by the handler when supplied) -----
    output_dir: Optional[str] = Field(
        default=None,
        description="Output directory (created if missing); defaults to OUTPUT_FOLDER.",
    )
    input_dir: Optional[str] = Field(
        default=None,
        description="Input/working directory for page images; defaults to INPUT_FOLDER.",
    )

    # --- optional processing switches, forwarded unchanged ---------------
    text_extract_method: Optional[str] = Field(
        default=None,
        description="OCR/text mode passed to prepare (defaults to CLI ocr_method).",
    )
    efficient_ocr: Optional[bool] = Field(
        default=None,
        description="If set, overrides EFFICIENT_OCR for the prepare step.",
    )
648
+
649
+
650
@router.post(
    "/combine_review_csvs",
    response_model=AgentTaskResponse,
    summary="combine_review_csvs (Gradio api_name)",
    description="Uses tools.helper_functions.merge_csv_files (not cli_redact).",
)
def post_combine_review_csvs(
    body: AgentCombineReviewCsvsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Merge the given review CSVs into an existing, allowed output directory."""
    from tools.helper_functions import merge_csv_files

    # merge_csv_files is handed objects exposing ``.name`` rather than strings.
    wrapped_inputs = [
        _NamedPath(_path_must_be_allowed_file(candidate))
        for candidate in body.input_files
    ]

    target_dir = _path_must_be_allowed_directory(
        str(body.output_dir or OUTPUT_FOLDER), must_exist=True
    )
    # The helper receives the folder with a trailing separator.
    if target_dir.endswith(("/", "\\")):
        folder_arg = target_dir
    else:
        folder_arg = target_dir + "/"

    merged_files = merge_csv_files(wrapped_inputs, output_folder=folder_arg)
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="combine_review_csvs",
        task="combine_review_csvs",
        output_dir=target_dir,
        input_dir="",
        message="merge_csv_files completed",
        output_paths=merged_files,
    )
676
+
677
+
678
class AgentExportReviewRedactionOverlayRequest(BaseModel):
    """Agent JSON body for the same overlay render as Gradio ``api_name='page_redaction_review_image'``."""

    # --- required inputs -------------------------------------------------
    page_image_path: str = Field(
        default=...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    # At least one box is required.
    boxes: list[dict[str, Any]] = Field(
        default=...,
        min_length=1,
        description="Annotator-style boxes: label, color, xmin, ymin, xmax, ymax (normalized 0–1).",
    )

    # --- output naming ---------------------------------------------------
    page_number: int = Field(
        default=1, ge=1, description="1-based page index for the output filename."
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )

    # --- optional rendering inputs ---------------------------------------
    review_df_records: Optional[list[dict[str, Any]]] = Field(
        default=None,
        description="Optional rows (include at least 'label') for stable label→line-pattern mapping.",
    )
    # Bounded to 0-24 when given.
    label_abbrev_chars: Optional[int] = Field(
        default=None,
        ge=0,
        le=24,
        description="Draw this many leading characters of each label on the image; omit to use REVIEW_OVERLAY_LABEL_ABBREV_CHARS from config (0 = off).",
    )
707
+
708
+
709
class AgentExportReviewPageOcrVisualisationRequest(BaseModel):
    """Agent JSON body for the same OCR visualisation as Gradio ``api_name='page_ocr_review_image'``."""

    # --- required inputs -------------------------------------------------
    page_image_path: str = Field(
        default=...,
        description="Path to page raster (PNG/JPEG) used as underlay; must be under allowed roots.",
    )
    ocr_results: dict[str, Any] = Field(
        default=...,
        description="Word-level OCR results dict (line_key -> {words:[{text, bounding_box, conf, ...}]}).",
    )

    # --- output naming ---------------------------------------------------
    page_number: int = Field(
        default=1, ge=1, description="1-based page index (used for naming)."
    )
    doc_base_name: str = Field(
        default="review",
        description="Basename for output file (e.g. document name without extension).",
    )
727
+
728
+
729
@router.post(
    "/export_review_redaction_overlay",
    response_model=AgentTaskResponse,
    summary="export_review_redaction_overlay (Agent API; Gradio api_name: page_redaction_review_image)",
    description=(
        "Renders hollow redaction outlines and a top-right legend on the page image; "
        "writes ``redaction_overlay/{doc_base_name}_page{n}_redaction_overlay.jpg`` under OUTPUT_FOLDER "
        "(scaled per REVIEW_OVERLAY_MAX_PIXELS, JPEG capped by REVIEW_OVERLAY_MAX_FILE_BYTES). "
        "Uses ``tools.redaction_review.visualise_review_redaction_boxes``."
    ),
)
def post_export_review_redaction_overlay(
    body: AgentExportReviewRedactionOverlayRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Render the redaction-box overlay for one page image under OUTPUT_FOLDER.

    Raises:
        HTTPException: 400 when OUTPUT_FOLDER fails the path-safety check;
            500 when the folder cannot be created or the renderer returns no
            path. The path validators may raise their own errors.
    """
    import pandas as pd

    from tools.redaction_review import visualise_review_redaction_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)
    annotator: dict[str, Any] = {"image": img_path, "boxes": body.boxes}
    # An empty frame is passed when the caller supplied no review rows.
    review_df = (
        pd.DataFrame(body.review_df_records)
        if body.review_df_records
        else pd.DataFrame()
    )

    # Resolve the configured output folder and check it like a request path.
    out_folder = os.path.realpath(
        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
    )
    if not validate_path_safety(out_folder):
        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
    _must_be_under_allowed_roots(out_folder, str(out_folder))
    try:
        Path(out_folder).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Chain the cause so the underlying OS error stays visible in logs.
        raise HTTPException(
            status_code=500, detail="Could not create OUTPUT_FOLDER"
        ) from e

    path = visualise_review_redaction_boxes(
        annotator,
        review_df=review_df,
        output_folder=out_folder,
        page_number=body.page_number,
        doc_base_name=body.doc_base_name,
        label_abbrev_chars=body.label_abbrev_chars,
    )
    if not path:
        # The route description states the overlay is written as a .jpg, so
        # the messages refer to a JPEG rather than a PNG.
        raise HTTPException(
            status_code=500,
            detail=(
                "Could not produce overlay JPEG (invalid image/boxes or write failed). "
                "Ensure boxes are valid and the image loads."
            ),
        )
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_redaction_overlay",
        task="export_review_redaction_overlay",
        output_dir=out_folder,
        input_dir="",
        message="Redaction overlay JPEG written",
        output_paths=[path],
    )
792
+
793
+
794
@router.post(
    "/export_review_page_ocr_visualisation",
    response_model=AgentTaskResponse,
    summary="export_review_page_ocr_visualisation (Agent API; Gradio api_name: page_ocr_review_image)",
    description=(
        "Renders a per-page OCR visualisation using tools.file_redaction.visualise_ocr_words_bounding_boxes; "
        "writes under OUTPUT_FOLDER/review_ocr_visualisations/."
    ),
)
def post_export_review_page_ocr_visualisation(
    body: AgentExportReviewPageOcrVisualisationRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Render the OCR word-box visualisation for one page image.

    Raises:
        HTTPException: 400 when OUTPUT_FOLDER fails the path-safety check;
            500 when the folder cannot be created, the image cannot be opened,
            the renderer raises, or no output path is produced.
    """
    from PIL import Image

    from tools.file_redaction import visualise_ocr_words_bounding_boxes

    img_path = _path_must_be_allowed_file(body.page_image_path)

    # Resolve the configured output folder and check it like a request path.
    out_folder = os.path.realpath(
        os.path.abspath(os.path.expanduser(str(OUTPUT_FOLDER)))
    )
    if not validate_path_safety(out_folder):
        raise HTTPException(status_code=400, detail="Unsafe OUTPUT_FOLDER path")
    _must_be_under_allowed_roots(out_folder, str(out_folder))
    try:
        Path(out_folder).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # Chain the cause so the underlying OS error stays visible in logs.
        raise HTTPException(
            status_code=500, detail="Could not create OUTPUT_FOLDER"
        ) from e

    # doc_base_name comes from the request body and ends up in a filename, so
    # keep only its final path component (both separator styles) to stop a
    # crafted value from pointing outside the visualisation folder.
    raw_base = str(body.doc_base_name or "review")
    safe_base = os.path.basename(raw_base.replace("\\", "/")) or "review"
    image_name = f"{safe_base}_page{int(body.page_number)}.png"

    log_paths: list[str] = []
    try:
        # Open inside a context manager so the file handle is released as
        # soon as the RGB copy has been made.
        with Image.open(img_path) as source_image:
            page_rgb = source_image.convert("RGB")
        log_paths = visualise_ocr_words_bounding_boxes(
            page_rgb,
            body.ocr_results,
            image_name=image_name,
            output_folder=out_folder,
            visualisation_folder="review_ocr_visualisations",
            add_legend=True,
            log_files_output_paths=log_paths,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    if not log_paths:
        raise HTTPException(
            status_code=500,
            detail="Could not produce OCR visualisation (invalid image/ocr_results or write failed).",
        )
    # The most recently appended path is reported as the visualisation.
    out_path = log_paths[-1]
    return AgentTaskResponse(
        status="completed",
        gradio_api_name="export_review_page_ocr_visualisation",
        task="export_review_page_ocr_visualisation",
        output_dir=out_folder,
        input_dir="",
        message="OCR visualisation written",
        output_paths=[out_path],
    )
856
+
857
+
858
def _gradio_only(api_name: str, detail: str) -> JSONResponse:
    """Build the HTTP 501 reply that points callers at the Gradio HTTP API.

    Args:
        api_name: Gradio ``api_name`` the caller should use instead.
        detail: Explanation of why the flow is not offered on /agent.
    """
    call_route = f"/gradio_api/call/{api_name}"

    gradio_http = {
        "discover_schema": "GET /gradio_api/info",
        "start_call": f"POST {call_route}",
        "request_body_shape": '{"data": [<args in schema order>]}',
        "poll": f"GET {call_route}/{{event_id}}",
    }
    client_notes = [
        "Pass api_name explicitly; do not rely on inferring the endpoint from "
        "Python function names (large Blocks apps will look ambiguous).",
        "If predict() still cannot resolve the route, open GET /gradio_api/info "
        "and use the numeric fn_index with gradio_client, or call the HTTP "
        "endpoints directly.",
        "The length of data must match the parameter list for this deployment; "
        "copy order and types from /gradio_api/info.",
    ]
    payload = {
        "gradio_api_name": api_name,
        "detail": detail,
        "hint": (
            "This flow is Gradio-session stateful. Call the named route on the "
            "Gradio HTTP API, not /agent."
        ),
        "gradio_http": gradio_http,
        "gradio_client_notes": client_notes,
    }
    return JSONResponse(status_code=501, content=payload)
885
+
886
+
887
@router.post("/load_and_prepare_documents_or_data")
def post_load_and_prepare_documents_or_data() -> JSONResponse:
    # Always answers with the 501 "use the Gradio API" payload for this flow.
    api_name = "load_and_prepare_documents_or_data"
    reason = "Preparation uses Gradio session state and prepare_image_or_pdf_with_efficient_ocr; no single CLI task."
    return _gradio_only(api_name, reason)
893
+
894
+
895
@router.post(
    "/apply_review_redactions",
    response_model=AgentTaskResponse,
    summary="apply_review_redactions (Gradio api_name)",
    description=(
        "Runs prepare_image_or_pdf_with_efficient_ocr([pdf, review_csv]) then "
        "apply_redactions_to_review_df_and_files — same core pipeline as the Review tab, "
        "without Gradio session state. Requires paths under allowed roots."
    ),
)
def post_apply_review_redactions(
    body: AgentApplyReviewRedactionsRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentTaskResponse:
    """Validate the request paths and delegate to ``run_apply_review_redactions``.

    Raises:
        HTTPException: 400 when the helper raises ``ValueError``; 500 for any
            other exception it raises.
    """
    from tools.simplified_api import run_apply_review_redactions

    source_pdf = _path_must_be_allowed_file(body.pdf_path)
    review_plan = _path_must_be_allowed_file(body.review_csv_path)

    # Neither directory has to exist yet; both must sit under an allowed root.
    resolved_out = (
        None
        if body.output_dir is None
        else _path_must_be_allowed_directory(body.output_dir, must_exist=False)
    )
    resolved_in = (
        None
        if body.input_dir is None
        else _path_must_be_allowed_directory(body.input_dir, must_exist=False)
    )

    try:
        outcome = run_apply_review_redactions(
            pdf_path=source_pdf,
            review_csv_path=review_plan,
            output_dir=resolved_out,
            input_dir=resolved_in,
            text_extract_method=body.text_extract_method,
            efficient_ocr=body.efficient_ocr,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"apply_review_redactions failed: {e}",
        ) from e

    return AgentTaskResponse(
        status="completed",
        gradio_api_name="apply_review_redactions",
        task="apply_review_redactions",
        output_dir=outcome["output_dir"],
        input_dir=outcome["input_dir"],
        message=outcome["message"],
        output_paths=outcome.get("output_paths"),
    )
946
+
947
+
948
@router.post(
    "/verify_redaction_coverage",
    response_model=AgentVerifyRedactionResponse,
    summary="verify_redaction_coverage (Pass 1 programmatic QA)",
)
def post_verify_redaction_coverage(
    body: AgentVerifyRedactionRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentVerifyRedactionResponse:
    # Validates every request path against the allowed roots, runs
    # run_verify_redaction_coverage, and maps its report onto the response.
    # ValueError from the helper becomes HTTP 400; any other exception 500.
    from tools.simplified_api import run_verify_redaction_coverage

    review = _path_must_be_allowed_file(body.review_csv_path)
    ocr_words = _path_must_be_allowed_file(body.ocr_words_csv_path)
    redacted = None
    if body.redacted_pdf_path:
        redacted = _path_must_be_allowed_file(body.redacted_pdf_path)

    # pruned_output_path is a caller-chosen write target. The file itself may
    # not exist yet, so constrain its parent directory to the allowed roots,
    # matching how every other path on this router is validated.
    pruned_out = body.pruned_output_path
    if pruned_out:
        parent_dir, leaf = os.path.split(
            os.path.abspath(os.path.expanduser(str(pruned_out)))
        )
        pruned_out = os.path.join(
            _path_must_be_allowed_directory(parent_dir, must_exist=False), leaf
        )

    try:
        report, pruned_csv_path, prune_log = run_verify_redaction_coverage(
            review_csv_path=review,
            ocr_words_csv_path=ocr_words,
            must_redact=body.must_redact,
            must_not_redact=body.must_not_redact,
            redacted_pdf_path=redacted,
            total_pages=body.total_pages,
            min_word_length=body.min_word_length,
            sample_pixels=body.sample_pixels,
            auto_prune_suspicious=body.auto_prune_suspicious,
            pruned_output_path=pruned_out,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"verify_redaction_coverage failed: {e}"
        ) from e

    # coverage_pass and coverage_pass_strict carry the same value: the
    # report's "pass_strict", falling back to "pass" when that key is absent.
    strict_ok = bool(report.get("pass_strict", report.get("pass")))
    return AgentVerifyRedactionResponse(
        status="completed",
        coverage_pass=strict_ok,
        coverage_pass_strict=strict_ok,
        coverage_pass_with_cleanup=bool(report.get("pass_with_cleanup")),
        pruned_csv_path=pruned_csv_path,
        prune_log=prune_log,
        report=report,
    )
992
+
993
+
994
@router.post(
    "/word_level_ocr_text_search",
    response_model=AgentWordLevelOcrSearchResponse,
    summary="word_level_ocr_text_search (headless OCR CSV search)",
)
def post_word_level_ocr_text_search(
    body: AgentWordLevelOcrSearchRequest,
    _: None = Depends(_optional_agent_api_key),
) -> AgentWordLevelOcrSearchResponse:
    # Validates the CSV paths and delegates to run_word_level_ocr_text_search_api.
    # ValueError from the helper becomes HTTP 400; any other exception 500.
    from tools.simplified_api import run_word_level_ocr_text_search_api

    words_csv = _path_must_be_allowed_file(body.ocr_words_csv_path)
    # The review CSV is optional and only validated when one was supplied.
    review_csv = (
        _path_must_be_allowed_file(body.review_csv_path)
        if body.review_csv_path
        else None
    )

    try:
        hits = run_word_level_ocr_text_search_api(
            ocr_words_csv_path=words_csv,
            search_text=body.search_text,
            similarity_threshold=body.similarity_threshold,
            use_regex=body.use_regex,
            review_csv_path=review_csv,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"word_level_ocr_text_search failed: {e}"
        ) from e

    return AgentWordLevelOcrSearchResponse(status="completed", result=hits)
1024
+
1025
+
1026
@router.get("/operations")
def list_operations() -> dict[str, Any]:
    # Static catalogue of the /agent routes: the Gradio api names, the flows
    # that answer 501 here, and one descriptor per route.

    def _redact_notes() -> dict[str, Any]:
        # Notes shared by both redact routes; rebuilt per call so each route
        # entry owns its own lists.
        return {
            "ocr_method": [
                "Local OCR",
                "AWS Textract",
                "Local text",
            ],
            "chosen_local_ocr_model_override": LOCAL_OCR_MODEL_OPTIONS,
            "pii_detector_recommended": [
                LOCAL_PII_OPTION,
                AWS_PII_OPTION,
                AWS_LLM_PII_OPTION,
                INFERENCE_SERVER_PII_OPTION,
                LOCAL_TRANSFORMERS_LLM_PII_OPTION,
                "None",
            ],
        }

    def _route(
        api_name: str, implementation: str, notes: Optional[dict[str, Any]] = None
    ) -> dict[str, Any]:
        # Every route is a POST mounted at /agent/<api_name>.
        entry: dict[str, Any] = {
            "gradio_api_name": api_name,
            "method": "POST",
            "path": f"/agent/{api_name}",
            "implementation": implementation,
        }
        if notes is not None:
            entry["notes"] = notes
        return entry

    verify_notes = {
        "purpose": "Pass 1 programmatic QA — pass_strict (policy), pass_with_cleanup (+ suspicious rows), optional prune and text/pixel checks.",
        "must_redact": "list of regex strings",
        "must_not_redact": "list of regex strings",
        "auto_prune_suspicious": "remove short OCR-fragment rows before reporting",
        "pages_flagged_for_vlm": "policy/visual failures only",
        "pages_needing_csv_cleanup": "suspicious rows — prune, not VLM",
        "leak_likely_causes": "per-page hints when text_layer_leaks (coord_not_normalized, missing_page_boxes, etc.) — not a broken /review_apply",
    }

    session_state_endpoints = {
        "description": (
            "These api_name values are exposed on the Gradio HTTP API but return "
            "501 on /agent because they depend on in-memory Gradio state."
        ),
        "discover_schema": "GET /gradio_api/info",
        "call_pattern": 'POST /gradio_api/call/<api_name> with JSON body {"data": [...]}',
        "names": [
            "load_and_prepare_documents_or_data",
        ],
    }

    routes = [
        _route("redact_document", "cli_redact task redact", _redact_notes()),
        _route("redact_data", "cli_redact task redact", _redact_notes()),
        _route("find_duplicate_pages", "cli_redact deduplicate pages"),
        _route("find_duplicate_tabular", "cli_redact deduplicate tabular"),
        _route("summarise_document", "cli_redact task summarise"),
        _route("combine_review_pdfs", "cli_redact combine_review_pdfs"),
        _route("export_review_redaction_overlay", "visualise_review_redaction_boxes"),
        _route(
            "export_review_page_ocr_visualisation",
            "visualise_ocr_words_bounding_boxes",
        ),
        _route("combine_review_csvs", "helper merge_csv_files"),
        _route("load_and_prepare_documents_or_data", "not_implemented_http"),
        _route(
            "apply_review_redactions",
            "tools.simplified_api.run_apply_review_redactions",
        ),
        _route(
            "verify_redaction_coverage",
            "tools.verify_redaction_coverage.verify_redaction_coverage",
            verify_notes,
        ),
        _route(
            "word_level_ocr_text_search",
            "tools.verify_redaction_coverage.run_word_level_ocr_text_search",
        ),
    ]

    return {
        "gradio_api_names": list(GRADIO_API_NAMES),
        "gradio_session_state_endpoints": session_state_endpoints,
        "routes": routes,
    }
1163
+
1164
+
1165
@router.get("/health")
def agent_health() -> dict[str, str]:
    # Liveness probe: a constant payload, no checks performed.
    payload = {"status": "ok", "service": "agent"}
    return payload
app.py ADDED
The diff for this file is too large to render. See raw diff
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from aws_cdk import App, Environment
4
+ from cdk_appregistry import register_doc_redaction_application
5
+ from cdk_config import (
6
+ ALB_NAME,
7
+ APPREGISTRY_APPLICATION_NAME,
8
+ APPREGISTRY_ATTRIBUTE_GROUP_NAME,
9
+ APPREGISTRY_DESCRIPTION,
10
+ APPREGISTRY_REPOSITORY_URL,
11
+ APPREGISTRY_STACK_NAME,
12
+ AWS_ACCOUNT_ID,
13
+ AWS_REGION,
14
+ CDK_CONTEXT_FILE,
15
+ CDK_PREFIX,
16
+ ENABLE_APPREGISTRY,
17
+ RUN_USEAST_STACK,
18
+ USE_CLOUDFRONT,
19
+ )
20
+ from cdk_functions import (
21
+ create_basic_config_env,
22
+ load_context_from_file,
23
+ log_aws_credential_context,
24
+ purge_cdk_lookup_context,
25
+ )
26
+ from cdk_stack import CdkStack, CdkStackCloudfront # , CdkStackMain
27
+ from check_resources import CONTEXT_FILE, check_and_set_context
28
+
29
# Initialize the CDK app
app = App()

log_aws_credential_context(
    expected_account_id=AWS_ACCOUNT_ID,
    expected_region=AWS_REGION,
)

# Drop stale CDK lookup cache entries (require bootstrap lookup role in target account).
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

# --- Pre-check context (boto3) — written to precheck.context.json, NOT cdk.context.json ---
print(f"Pre-check context file: {CONTEXT_FILE}")
print(f"CDK lookup cache file: {CDK_CONTEXT_FILE}")
# Compare basenames with separators normalised so a Windows-style path cannot
# hide the fact that both settings point at the same file name.
if os.path.basename(CONTEXT_FILE.replace("\\", "/")) == os.path.basename(
    CDK_CONTEXT_FILE.replace("\\", "/")
):
    raise RuntimeError(
        f"CONTEXT_FILE and CDK_CONTEXT_FILE must differ (got '{CONTEXT_FILE}' for both). "
        "Set CONTEXT_FILE=precheck.context.json in config/cdk_config.env."
    )

print("Running pre-check script to generate application context...")
try:
    check_and_set_context()
    if not os.path.exists(CONTEXT_FILE):
        raise RuntimeError(
            f"check_and_set_context() finished, but {CONTEXT_FILE} was not created."
        )
    print(f"Context generated successfully at {CONTEXT_FILE}.")
except Exception as e:
    # Chain the original exception so its traceback is kept alongside the
    # wrapping RuntimeError.
    raise RuntimeError(
        f"Failed to generate context via check_and_set_context(): {e}"
    ) from e

# Pre-check must not repopulate CDK lookup keys; purge again if paths were ever shared.
purge_cdk_lookup_context(CDK_CONTEXT_FILE)

if os.path.exists(CONTEXT_FILE):
    load_context_from_file(app, CONTEXT_FILE)
else:
    raise RuntimeError(f"Could not find {CONTEXT_FILE}.")

create_basic_config_env("config")

aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)

regional_stack = CdkStack(
    app, "RedactionStack", env=aws_env_regional, cross_region_references=True
)
regional_stack.termination_protection = True

# Feature switches arrive as the strings "True"/"False", hence the string comparisons.
if ENABLE_APPREGISTRY == "True":
    # Use pre-check context only — not regional_stack.params (avoids AppRegistry
    # -> RedactionStack dependency cycle during synth).
    _alb_dns_context = app.node.try_get_context(f"dns:{ALB_NAME}")
    # Treat a missing, non-string or blank context value as "no DNS name".
    _alb_dns_name = (
        _alb_dns_context.strip()
        if isinstance(_alb_dns_context, str) and _alb_dns_context.strip()
        else None
    )
    appregistry_stack = register_doc_redaction_application(
        app,
        aws_account_id=AWS_ACCOUNT_ID,
        aws_region=AWS_REGION,
        application_name=APPREGISTRY_APPLICATION_NAME,
        application_description=APPREGISTRY_DESCRIPTION,
        appregistry_stack_name=APPREGISTRY_STACK_NAME,
        attribute_group_name=APPREGISTRY_ATTRIBUTE_GROUP_NAME,
        repository_url=APPREGISTRY_REPOSITORY_URL,
        cdk_prefix=CDK_PREFIX,
        use_cloudfront=USE_CLOUDFRONT,
        alb_dns_name=_alb_dns_name,
    )
    appregistry_stack.termination_protection = True

if USE_CLOUDFRONT == "True" and RUN_USEAST_STACK == "True":
    aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")

    # The CloudFront stack is created in us-east-1 and wired to the regional
    # stack's ALB outputs via cross-region references.
    cloudfront_stack = CdkStackCloudfront(
        app,
        "RedactionStackCloudfront",
        env=aws_env_us_east_1,
        alb_arn=regional_stack.params["alb_arn_output"],
        alb_sec_group_id=regional_stack.params["alb_security_group_id"],
        alb_dns_name=regional_stack.params["alb_dns_name"],
        cross_region_references=True,
    )

# CDK CLI invokes this script and expects a cloud assembly in cdk.out.
# Without app.synth(), Python defines constructs but never writes manifest.json
# (ENOENT on deploy). See: https://github.com/aws/aws-cdk/issues/11023
app.synth()
cdk/cdk.json.example ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "app": "python app.py",
3
+ "output": "cdk.out",
4
+ "context": {
5
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": false
6
+ }
7
+ }
cdk/cdk_appregistry.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """AWS Console myApplications (Service Catalog AppRegistry) integration."""
2
+
3
+ from aws_cdk import App, Environment
4
+ from aws_cdk.aws_servicecatalogappregistry_alpha import (
5
+ ApplicationAssociator,
6
+ TargetApplication,
7
+ )
8
+
9
+
10
def register_doc_redaction_application(
    app: App,
    *,
    aws_account_id: str,
    aws_region: str,
    application_name: str,
    application_description: str,
    appregistry_stack_name: str,
    attribute_group_name: str,
    repository_url: str,
    cdk_prefix: str,
    use_cloudfront: str,
    alb_dns_name: str | None = None,
) -> ApplicationAssociator:
    """
    Create the AppRegistry application stack and attach deployment metadata.

    The application stack is created in ``aws_region`` only (phase 1), so
    cross-region stacks such as RedactionStackCloudfront (us-east-1) are not
    associated with the myApplications entry.

    ``alb_dns_name`` has to be a plain string (for example a value read from
    the pre-check context). Passing a CloudFormation token that belongs to
    RedactionStack makes synth fail with a dependency cycle against the
    associator stack. When it is empty or ``None`` the ``albDnsName``
    attribute is simply left out.

    Returns the ``ApplicationAssociator`` that was created.
    """
    target_env = Environment(account=aws_account_id, region=aws_region)
    target_application = TargetApplication.create_application_stack(
        application_name=application_name,
        application_description=application_description,
        stack_name=appregistry_stack_name,
        env=target_env,
    )
    associator = ApplicationAssociator(
        app,
        "DocRedactionAppRegistry",
        applications=[target_application],
    )

    cloudfront_note = (
        "CloudFront/WAF (RedactionStackCloudfront) is in us-east-1 and is "
        "not linked to this myApplications entry in phase 1. View it in "
        "CloudFormation (us-east-1) or the CloudFront console."
    )
    # Only advertise the ALB DNS name when a concrete value was supplied.
    optional_attributes = {"albDnsName": alb_dns_name} if alb_dns_name else {}
    attributes = {
        "repository": repository_url,
        "cdkPrefix": cdk_prefix,
        "awsRegion": aws_region,
        "useCloudFront": use_cloudfront,
        "cloudFrontInAppRegistry": "false",
        "cloudFrontNote": cloudfront_note,
        **optional_attributes,
    }

    associator.app_registry_application.add_attribute_group(
        "DocRedactionAttributeGroup",
        attribute_group_name=attribute_group_name,
        description="doc_redaction deployment metadata",
        attributes=attributes,
    )

    return associator
cdk/cdk_config.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from typing import List
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ # Set or retrieve configuration variables for CDK redaction deployment
8
+
9
+
10
def convert_string_to_boolean(value: str) -> bool:
    """Convert a string flag to a boolean.

    Recognises ``true``/``false`` and ``1``/``0``. Matching ignores letter
    case and surrounding whitespace, so values such as ``" True "`` copied
    from an env file are accepted. A ``bool`` is returned unchanged.

    Args:
        value: The flag to interpret (a string, or an existing ``bool``).

    Returns:
        The boolean the flag represents.

    Raises:
        ValueError: If ``value`` is not a recognised boolean representation.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False
    raise ValueError(f"Invalid boolean value: {value}")
20
+
21
+
22
def parse_comma_separated_list(value: str) -> List[str]:
    """Parse a comma-separated env value into a list of non-empty strings.

    Optional surrounding square brackets and per-item single or double quotes
    are removed, so both ``a,b`` and ``["a", "b"]`` give ``["a", "b"]``.

    Args:
        value: Raw value; ``None``, empty and whitespace-only input give ``[]``.

    Returns:
        The cleaned items in their original order, with empty items dropped.
    """
    if not value or not str(value).strip():
        return []
    cleaned = str(value).strip().strip("[]")
    # Clean each item fully *before* testing for emptiness, otherwise an item
    # that is only quotes (e.g. '""') would survive as an empty string.
    items = (part.strip().strip("\"'").strip() for part in cleaned.split(","))
    return [item for item in items if item]
32
+
33
+
34
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
    """
    Return the value of an environment variable, creating it with
    ``default_value`` first when it is not set yet.
    """
    # setdefault only writes when the variable is missing, then returns
    # whatever is stored under that name.
    value = os.environ.setdefault(var_name, default_value)

    if print_val is True:
        print(f"The value of {var_name} is {value}")

    return value
50
+
51
+
52
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` (and any missing parents) unless the path already exists."""
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # Path is missing: create it, tolerating a concurrent creation.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
61
+
62
+
63
def add_folder_to_path(folder_path: str):
    """
    Prepend an existing folder (as an absolute path) to the PATH environment variable.

    Nothing is changed when the folder does not exist or its absolute path is
    already one of the PATH entries. Only relevant for locally-created
    executables of this app: pyinstaller produces an _internal folder holding
    tesseract and poppler, which must be on PATH for the app to run.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # PATH entries are compared as absolute paths.
    resolved = os.path.abspath(folder_path)
    existing = os.environ["PATH"]

    if resolved in existing.split(os.pathsep):
        print(f"Directory {folder_path} already exists in PATH.")
        return

    os.environ["PATH"] = os.pathsep.join([resolved, existing])
83
+
84
+
85
+ ###
86
+ # LOAD CONFIG FROM ENV FILE
87
+ ###
88
+ CONFIG_FOLDER = get_or_create_env_var("CONFIG_FOLDER", "config/")
89
+
90
+ ensure_folder_exists(CONFIG_FOLDER)
91
+
92
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
93
+ CDK_CONFIG_PATH = get_or_create_env_var(
94
+ "CDK_CONFIG_PATH", "config/cdk_config.env"
95
+ ) # e.g. config/cdk_config.env
96
+
97
+ if CDK_CONFIG_PATH:
98
+ if os.path.exists(CDK_CONFIG_PATH):
99
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
100
+ load_dotenv(CDK_CONFIG_PATH)
101
+ else:
102
+ print("CDK config file not found at location:", CDK_CONFIG_PATH)
103
+
104
+ ###
105
+ # AWS OPTIONS
106
+ ###
107
+ AWS_REGION = get_or_create_env_var("AWS_REGION", "")
108
+ AWS_ACCOUNT_ID = get_or_create_env_var("AWS_ACCOUNT_ID", "")
109
+
110
+ ###
111
+ # CDK OPTIONS
112
+ ###
113
+ CDK_PREFIX = get_or_create_env_var("CDK_PREFIX", "")
114
+
115
+ # AWS Console myApplications (Service Catalog AppRegistry)
116
+ ENABLE_APPREGISTRY = get_or_create_env_var("ENABLE_APPREGISTRY", "True")
117
+ APPREGISTRY_APPLICATION_NAME = get_or_create_env_var(
118
+ "APPREGISTRY_APPLICATION_NAME", f"{CDK_PREFIX}doc-redaction"
119
+ )
120
+ APPREGISTRY_DESCRIPTION = get_or_create_env_var(
121
+ "APPREGISTRY_DESCRIPTION",
122
+ "PII document redaction app (ALB, ECS Fargate, Cognito, S3)",
123
+ )
124
+ APPREGISTRY_STACK_NAME = get_or_create_env_var(
125
+ "APPREGISTRY_STACK_NAME", f"{CDK_PREFIX}AppRegistryStack"
126
+ )
127
+ APPREGISTRY_ATTRIBUTE_GROUP_NAME = get_or_create_env_var(
128
+ "APPREGISTRY_ATTRIBUTE_GROUP_NAME",
129
+ f"{APPREGISTRY_APPLICATION_NAME}-metadata",
130
+ )
131
+ APPREGISTRY_REPOSITORY_URL = get_or_create_env_var(
132
+ "APPREGISTRY_REPOSITORY_URL",
133
+ "https://github.com/seanpedrick-case/doc_redaction",
134
+ )
135
+
136
+ _precheck_context_file = get_or_create_env_var("CONTEXT_FILE", "precheck.context.json")
137
+ # Never write boto3 pre-check output into CDK's lookup cache file (causes stale
138
+ # vpc-provider / load-balancer entries and wrong-account lookup validation errors).
139
+ if os.path.basename(_precheck_context_file.replace("\\", "/")) == "cdk.context.json":
140
+ print(
141
+ "WARNING: CONTEXT_FILE must not be 'cdk.context.json' (that file is CDK's "
142
+ "lookup cache). Using 'precheck.context.json' instead. Update "
143
+ "config/cdk_config.env and remove CONTEXT_FILE=cdk.context.json if set."
144
+ )
145
+ _precheck_context_file = "precheck.context.json"
146
+ CONTEXT_FILE = _precheck_context_file
147
+ CDK_CONTEXT_FILE = get_or_create_env_var("CDK_CONTEXT_FILE", "cdk.context.json")
148
+ CDK_FOLDER = get_or_create_env_var(
149
+ "CDK_FOLDER", ""
150
+ ) # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
151
+
152
+ # App runtime config (uploaded to S3 for legacy Fargate; inlined for ECS Express Mode)
153
+ _app_config_rel = os.path.join(CONFIG_FOLDER, "config.env").replace("\\", "/")
154
+ APP_CONFIG_ENV_FILE = get_or_create_env_var(
155
+ "APP_CONFIG_ENV_FILE",
156
+ (
157
+ os.path.normpath(os.path.join(CDK_FOLDER, _app_config_rel))
158
+ if CDK_FOLDER
159
+ else os.path.normpath(_app_config_rel)
160
+ ),
161
+ )
162
+ RUN_USEAST_STACK = get_or_create_env_var("RUN_USEAST_STACK", "False")
163
+
164
+ ### VPC and connections
165
+ VPC_NAME = get_or_create_env_var("VPC_NAME", "")
166
+ NEW_VPC_DEFAULT_NAME = get_or_create_env_var("NEW_VPC_DEFAULT_NAME", f"{CDK_PREFIX}vpc")
167
+ NEW_VPC_CIDR = get_or_create_env_var("NEW_VPC_CIDR", "") # "10.0.0.0/24"
168
+
169
+
170
+ EXISTING_IGW_ID = get_or_create_env_var("EXISTING_IGW_ID", "")
171
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var("SINGLE_NAT_GATEWAY_ID", "")
172
+
173
+ ### SUBNETS / ROUTE TABLES / NAT GATEWAY
174
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var(
175
+ "PUBLIC_SUBNETS_TO_USE", ""
176
+ ) # e.g. ['PublicSubnet1', 'PublicSubnet2']
177
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
178
+ "PUBLIC_SUBNET_CIDR_BLOCKS", ""
179
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
180
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
181
+ "PUBLIC_SUBNET_AVAILABILITY_ZONES", ""
182
+ ) # e.g. ["eu-west-2a", "eu-west-2b"]
183
+
184
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var(
185
+ "PRIVATE_SUBNETS_TO_USE", ""
186
+ ) # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
187
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var(
188
+ "PRIVATE_SUBNET_CIDR_BLOCKS", ""
189
+ ) # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
190
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var(
191
+ "PRIVATE_SUBNET_AVAILABILITY_ZONES", ""
192
+ ) # e.g. ["eu-west-2a", "eu-west-2b"]
193
+
194
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var(
195
+ "ROUTE_TABLE_BASE_NAME", f"{CDK_PREFIX}PrivateRouteTable"
196
+ )
197
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var(
198
+ "NAT_GATEWAY_EIP_NAME", f"{CDK_PREFIX}NatGatewayEip"
199
+ )
200
+ NAT_GATEWAY_NAME = get_or_create_env_var("NAT_GATEWAY_NAME", f"{CDK_PREFIX}NatGateway")
201
+
202
+ # IAM roles
203
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var(
204
+ "AWS_MANAGED_TASK_ROLES_LIST",
205
+ '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs", "AmazonBedrockFullAccess"]',
206
+ )
207
+ POLICY_FILE_LOCATIONS = get_or_create_env_var(
208
+ "POLICY_FILE_LOCATIONS", ""
209
+ ) # e.g. '["config/sts_permissions.json"]'
210
+ POLICY_FILE_ARNS = get_or_create_env_var("POLICY_FILE_ARNS", "")
211
+
212
+ # GITHUB REPO
213
+ GITHUB_REPO_USERNAME = get_or_create_env_var("GITHUB_REPO_USERNAME", "seanpedrick-case")
214
+ GITHUB_REPO_NAME = get_or_create_env_var("GITHUB_REPO_NAME", "doc_redaction")
215
+ GITHUB_REPO_BRANCH = get_or_create_env_var("GITHUB_REPO_BRANCH", "main")
216
+
217
+ ### CODEBUILD
218
+ CODEBUILD_ROLE_NAME = get_or_create_env_var(
219
+ "CODEBUILD_ROLE_NAME", f"{CDK_PREFIX}CodeBuildRole"
220
+ )
221
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var(
222
+ "CODEBUILD_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildProject"
223
+ )
224
+
225
+ ### ECR
226
+ ECR_REPO_NAME = get_or_create_env_var(
227
+ "ECR_REPO_NAME", "doc-redaction"
228
+ ) # Beware - cannot have underscores and must be lower case
229
+ ECR_CDK_REPO_NAME = get_or_create_env_var(
230
+ "ECR_CDK_REPO_NAME", f"{CDK_PREFIX}{ECR_REPO_NAME}".lower()
231
+ )
232
+
233
+ ### S3
234
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var(
235
+ "S3_LOG_CONFIG_BUCKET_NAME", f"{CDK_PREFIX}s3-logs".lower()
236
+ ) # S3 bucket names need to be lower case
237
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var(
238
+ "S3_OUTPUT_BUCKET_NAME", f"{CDK_PREFIX}s3-output".lower()
239
+ )
240
+
241
+ ### KMS KEYS FOR S3 AND SECRETS MANAGER
242
+ USE_CUSTOM_KMS_KEY = get_or_create_env_var("USE_CUSTOM_KMS_KEY", "1")
243
+ CUSTOM_KMS_KEY_NAME = get_or_create_env_var(
244
+ "CUSTOM_KMS_KEY_NAME", f"alias/{CDK_PREFIX}kms-key".lower()
245
+ )
246
+
247
+ ### ECS
248
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var(
249
+ "FARGATE_TASK_DEFINITION_NAME", f"{CDK_PREFIX}FargateTaskDefinition"
250
+ )
251
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var(
252
+ "TASK_DEFINITION_FILE_LOCATION", CDK_FOLDER + CONFIG_FOLDER + "task_definition.json"
253
+ )
254
+
255
+ CLUSTER_NAME = get_or_create_env_var("CLUSTER_NAME", f"{CDK_PREFIX}Cluster")
256
+ ECS_SERVICE_NAME = get_or_create_env_var("ECS_SERVICE_NAME", f"{CDK_PREFIX}ECSService")
257
+ ECS_TASK_ROLE_NAME = get_or_create_env_var(
258
+ "ECS_TASK_ROLE_NAME", f"{CDK_PREFIX}TaskRole"
259
+ )
260
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var(
261
+ "ECS_TASK_EXECUTION_ROLE_NAME", f"{CDK_PREFIX}ExecutionRole"
262
+ )
263
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var(
264
+ "ECS_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupECS"
265
+ )
266
+ ECS_LOG_GROUP_NAME = get_or_create_env_var(
267
+ "ECS_LOG_GROUP_NAME", f"/ecs/{ECS_SERVICE_NAME}-logs".lower()
268
+ )
269
+
270
+ ECS_TASK_CPU_SIZE = get_or_create_env_var("ECS_TASK_CPU_SIZE", "1024")
271
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_TASK_MEMORY_SIZE", "4096")
272
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var("USE_FARGATE_SPOT", "False")
273
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var("ECS_READ_ONLY_FILE_SYSTEM", "True")
274
+
275
+ ### Cognito
276
+ COGNITO_USER_POOL_NAME = get_or_create_env_var(
277
+ "COGNITO_USER_POOL_NAME", f"{CDK_PREFIX}UserPool"
278
+ )
279
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var(
280
+ "COGNITO_USER_POOL_CLIENT_NAME", f"{CDK_PREFIX}UserPoolClient"
281
+ )
282
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var(
283
+ "COGNITO_USER_POOL_CLIENT_SECRET_NAME", f"{CDK_PREFIX}ParamCognitoSecret"
284
+ )
285
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var(
286
+ "COGNITO_USER_POOL_DOMAIN_PREFIX", "redaction-app-domain"
287
+ ) # Should change this to something unique or you'll probably hit an error
288
+
289
+ COGNITO_REFRESH_TOKEN_VALIDITY = int(
290
+ get_or_create_env_var("COGNITO_REFRESH_TOKEN_VALIDITY", "480")
291
+ ) # Minutes
292
+ COGNITO_ID_TOKEN_VALIDITY = int(
293
+ get_or_create_env_var("COGNITO_ID_TOKEN_VALIDITY", "60")
294
+ ) # Minutes
295
+ COGNITO_ACCESS_TOKEN_VALIDITY = int(
296
+ get_or_create_env_var("COGNITO_ACCESS_TOKEN_VALIDITY", "60")
297
+ ) # Minutes
298
+
299
+ # Application load balancer
300
+ ALB_NAME = get_or_create_env_var(
301
+ "ALB_NAME", f"{CDK_PREFIX}Alb"[-32:]
302
+ ) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
303
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var(
304
+ "ALB_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupALB"
305
+ )
306
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var(
307
+ "ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}-tg"[-32:]
308
+ ) # Max 32 characters
309
+ EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var("EXISTING_LOAD_BALANCER_ARN", "")
310
+ EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var(
311
+ "EXISTING_LOAD_BALANCER_DNS", "placeholder_load_balancer_dns.net"
312
+ )
313
+
314
+ ## CLOUDFRONT
315
+ USE_CLOUDFRONT = get_or_create_env_var("USE_CLOUDFRONT", "True")
316
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var(
317
+ "CLOUDFRONT_PREFIX_LIST_ID", "pl-93a247fa"
318
+ )
319
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var(
320
+ "CLOUDFRONT_GEO_RESTRICTION", ""
321
+ ) # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
322
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var(
323
+ "CLOUDFRONT_DISTRIBUTION_NAME", f"{CDK_PREFIX}CfDist"
324
+ )
325
+ CLOUDFRONT_DOMAIN = get_or_create_env_var(
326
+ "CLOUDFRONT_DOMAIN", "cloudfront_placeholder.net"
327
+ )
328
+
329
+
330
+ # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
331
+ ACM_SSL_CERTIFICATE_ARN = get_or_create_env_var("ACM_SSL_CERTIFICATE_ARN", "")
332
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var(
333
+ "SSL_CERTIFICATE_DOMAIN", ""
334
+ ) # e.g. example.com or www.example.com
335
+
336
+ # ECS Express Mode (opt-in HTTPS ingress without supplying ACM_SSL_CERTIFICATE_ARN).
337
+ # Pilot/dev: Express PrimaryContainer does not support S3 environmentFiles or Fargate mount points.
338
+ USE_ECS_EXPRESS_MODE = get_or_create_env_var("USE_ECS_EXPRESS_MODE", "False")
339
+ ECS_EXPRESS_SERVICE_NAME = get_or_create_env_var(
340
+ "ECS_EXPRESS_SERVICE_NAME", ECS_SERVICE_NAME
341
+ )
342
+ ECS_EXPRESS_HEALTH_CHECK_PATH = get_or_create_env_var(
343
+ "ECS_EXPRESS_HEALTH_CHECK_PATH", "/"
344
+ )
345
+ ECS_EXPRESS_INFRASTRUCTURE_ROLE_NAME = get_or_create_env_var(
346
+ "ECS_EXPRESS_INFRASTRUCTURE_ROLE_NAME", f"{CDK_PREFIX}ExpressInfraRole"
347
+ )
348
+ # After first deploy, set to ExpressServiceEndpoint output (https://...) if not using CloudFront.
349
+ ECS_EXPRESS_COGNITO_REDIRECT_BASE = get_or_create_env_var(
350
+ "ECS_EXPRESS_COGNITO_REDIRECT_BASE", ""
351
+ )
352
+
353
+ if USE_ECS_EXPRESS_MODE == "True" and ACM_SSL_CERTIFICATE_ARN:
354
+ raise ValueError(
355
+ "USE_ECS_EXPRESS_MODE=True cannot be used with ACM_SSL_CERTIFICATE_ARN set. "
356
+ "Clear ACM_SSL_CERTIFICATE_ARN or set USE_ECS_EXPRESS_MODE=False."
357
+ )
358
+
359
+ # ECS Service Connect (legacy Fargate only): VPC service-to-service HTTP to Gradio/FastAPI.
360
+ ENABLE_ECS_SERVICE_CONNECT = get_or_create_env_var(
361
+ "ENABLE_ECS_SERVICE_CONNECT", "False"
362
+ )
363
+ ECS_SERVICE_CONNECT_NAMESPACE = get_or_create_env_var(
364
+ "ECS_SERVICE_CONNECT_NAMESPACE",
365
+ (f"{CDK_PREFIX}local".lower().replace("_", "-").strip("-") or "redaction-local"),
366
+ )
367
+ ECS_SERVICE_CONNECT_DISCOVERY_NAME = get_or_create_env_var(
368
+ "ECS_SERVICE_CONNECT_DISCOVERY_NAME", "redaction"
369
+ )
370
+ # Optional friendly DNS label; defaults to discovery name when empty.
371
+ ECS_SERVICE_CONNECT_DNS_NAME = get_or_create_env_var("ECS_SERVICE_CONNECT_DNS_NAME", "")
372
+ # Client task security groups (at least one of IDs, names, or CDK prefixes required when SC on).
373
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS = get_or_create_env_var(
374
+ "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS", ""
375
+ )
376
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS_LIST = parse_comma_separated_list(
377
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS
378
+ )
379
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES = get_or_create_env_var(
380
+ "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES", ""
381
+ )
382
+ ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES = get_or_create_env_var(
383
+ "ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES", ""
384
+ )
385
+
386
+ # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
387
# Choose the default redirect URL by precedence: the CloudFront domain, then
# the SSL certificate domain, then (Express mode) the explicit redirect base or
# the load balancer DNS, and finally the load balancer DNS on its own.
if USE_CLOUDFRONT == "True":
    _cognito_redirect_default = "https://" + CLOUDFRONT_DOMAIN
elif SSL_CERTIFICATE_DOMAIN:
    _cognito_redirect_default = "https://" + SSL_CERTIFICATE_DOMAIN
elif USE_ECS_EXPRESS_MODE == "True":
    _express_redirect_default = ECS_EXPRESS_COGNITO_REDIRECT_BASE or (
        "https://" + EXISTING_LOAD_BALANCER_DNS
    )
    _cognito_redirect_default = _express_redirect_default
else:
    _cognito_redirect_default = "https://" + EXISTING_LOAD_BALANCER_DNS

# An explicitly set COGNITO_REDIRECTION_URL always wins over the default.
COGNITO_REDIRECTION_URL = get_or_create_env_var(
    "COGNITO_REDIRECTION_URL", _cognito_redirect_default
)
406
+
407
+ # Custom headers e.g. if routing traffic through Cloudfront
408
+ CUSTOM_HEADER = get_or_create_env_var(
409
+ "CUSTOM_HEADER", ""
410
+ ) # Retrieving or setting CUSTOM_HEADER
411
+ CUSTOM_HEADER_VALUE = get_or_create_env_var(
412
+ "CUSTOM_HEADER_VALUE", ""
413
+ ) # Retrieving or setting CUSTOM_HEADER_VALUE
414
+
415
+ # Firewall on top of load balancer
416
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var(
417
+ "LOAD_BALANCER_WEB_ACL_NAME", f"{CDK_PREFIX}alb-web-acl"
418
+ )
419
+
420
+ # Firewall on top of CloudFront
421
+ WEB_ACL_NAME = get_or_create_env_var("WEB_ACL_NAME", f"{CDK_PREFIX}cloudfront-web-acl")
422
+
423
+ ###
424
+ # File I/O options
425
+ ###
426
+
427
+ OUTPUT_FOLDER = get_or_create_env_var("GRADIO_OUTPUT_FOLDER", "output/") # 'output/'
428
+ INPUT_FOLDER = get_or_create_env_var("GRADIO_INPUT_FOLDER", "input/") # 'input/'
429
+
430
+ # Allow for files to be saved in a temporary folder for increased security in some instances
431
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Use mkdtemp() instead of `with tempfile.TemporaryDirectory()`: the context
    # manager removes the directory as soon as its block exits, which left
    # OUTPUT_FOLDER / INPUT_FOLDER pointing at a path that no longer existed.
    # mkdtemp() leaves the directory in place; it is not cleaned up automatically.
    temp_dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {temp_dir}")

    # Both folders share the same temporary directory when both are "TEMP".
    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"
440
+
441
+ ###
442
+ # LOGGING OPTIONS
443
+ ###
444
+
445
+ SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
446
+
447
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
448
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "True")
449
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
450
+ "ACCESS_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-access-logs".lower()
451
+ )
452
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
453
+ "FEEDBACK_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-feedback-logs".lower()
454
+ )
455
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
456
+ "USAGE_LOG_DYNAMODB_TABLE_NAME", f"{CDK_PREFIX}dynamodb-usage-logs".lower()
457
+ )
458
+
459
+ ###
460
+ # REDACTION OPTIONS
461
+ ###
462
+
463
+ # Get some environment variables and Launch the Gradio app
464
+ COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
465
+
466
+ GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))
467
+
468
+ # Must match the named port mapping on the Fargate container (see cdk_stack.py).
469
+ ECS_SERVICE_CONNECT_PORT_MAPPING_NAME = get_or_create_env_var(
470
+ "ECS_SERVICE_CONNECT_PORT_MAPPING_NAME", f"port-{GRADIO_SERVER_PORT}"
471
+ )
472
+
473
+ # Suffix used with ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES (matches this stack's ECS SG name).
474
+ if ECS_SECURITY_GROUP_NAME.startswith(CDK_PREFIX):
475
+ _default_sc_client_sg_suffix = ECS_SECURITY_GROUP_NAME[len(CDK_PREFIX) :]
476
+ else:
477
+ _default_sc_client_sg_suffix = "SecurityGroupECS"
478
+ ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX = get_or_create_env_var(
479
+ "ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX", _default_sc_client_sg_suffix
480
+ )
481
+
482
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_LIST = parse_comma_separated_list(
483
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES
484
+ )
485
+ ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES_LIST = parse_comma_separated_list(
486
+ ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES
487
+ )
488
+
489
+
490
def build_service_connect_client_security_group_names() -> List[str]:
    """Return explicit client SG names followed by ``{prefix}{suffix}`` names.

    The explicit names come first, then one derived name per client CDK
    prefix. Empty names are skipped and duplicates are removed, keeping the
    first occurrence.
    """
    derived = [
        f"{prefix}{ECS_SERVICE_CONNECT_CLIENT_SG_NAME_SUFFIX}"
        for prefix in ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES_LIST
    ]
    candidates = [*ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_LIST, *derived]
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(name for name in candidates if name))
502
+
503
+
504
+ ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES_TO_LOOKUP = (
505
+ build_service_connect_client_security_group_names()
506
+ )
507
+
508
+ if ENABLE_ECS_SERVICE_CONNECT == "True" and USE_ECS_EXPRESS_MODE == "True":
509
+ raise ValueError(
510
+ "ENABLE_ECS_SERVICE_CONNECT=True is only supported on the legacy Fargate "
511
+ "service path. Set USE_ECS_EXPRESS_MODE=False or disable Service Connect."
512
+ )
513
+
514
+ # S3-uploaded job .env files trigger one-shot ECS Fargate tasks (direct mode / cli_redact).
515
+ ENABLE_S3_BATCH_ECS_TRIGGER = get_or_create_env_var(
516
+ "ENABLE_S3_BATCH_ECS_TRIGGER", "False"
517
+ )
518
+ S3_BATCH_ENV_PREFIX = get_or_create_env_var("S3_BATCH_ENV_PREFIX", "input/config/")
519
+ S3_BATCH_ENV_SUFFIX = get_or_create_env_var("S3_BATCH_ENV_SUFFIX", ".env")
520
+ S3_BATCH_INPUT_PREFIX = get_or_create_env_var("S3_BATCH_INPUT_PREFIX", "input/")
521
+ S3_BATCH_CONFIG_PREFIX = get_or_create_env_var("S3_BATCH_CONFIG_PREFIX", "")
522
+ S3_BATCH_DEFAULT_PARAMS_KEY = get_or_create_env_var(
523
+ "S3_BATCH_DEFAULT_PARAMS_KEY", "general-config/batch_defaults.env"
524
+ )
525
+ S3_BATCH_LAMBDA_FUNCTION_NAME = get_or_create_env_var(
526
+ "S3_BATCH_LAMBDA_FUNCTION_NAME", ""
527
+ )
528
+
529
+ if ENABLE_S3_BATCH_ECS_TRIGGER == "True" and USE_ECS_EXPRESS_MODE == "True":
530
+ raise ValueError(
531
+ "ENABLE_S3_BATCH_ECS_TRIGGER=True requires the legacy Fargate task definition "
532
+ "for ecs.run_task. Set USE_ECS_EXPRESS_MODE=False or disable the batch trigger."
533
+ )
534
+
535
+ # Pi agent Gradio UI (second Fargate service; shared legacy ALB + Service Connect to main app).
536
+ ENABLE_PI_AGENT_ECS_SERVICE = get_or_create_env_var(
537
+ "ENABLE_PI_AGENT_ECS_SERVICE", "False"
538
+ )
539
+ ECR_PI_REPO_NAME = get_or_create_env_var(
540
+ "ECR_PI_REPO_NAME", f"{CDK_PREFIX}pi-agent".lower()
541
+ )
542
+ CODEBUILD_PI_PROJECT_NAME = get_or_create_env_var(
543
+ "CODEBUILD_PI_PROJECT_NAME", f"{CDK_PREFIX}CodeBuildPiAgent"
544
+ )
545
+ ECS_PI_SERVICE_NAME = get_or_create_env_var(
546
+ "ECS_PI_SERVICE_NAME", f"{CDK_PREFIX}PiAgentService"
547
+ )
548
+ ECS_PI_TASK_DEFINITION_NAME = get_or_create_env_var(
549
+ "ECS_PI_TASK_DEFINITION_NAME", f"{CDK_PREFIX}PiAgentTaskDefinition"
550
+ )
551
+ ECS_PI_SECURITY_GROUP_NAME = get_or_create_env_var(
552
+ "ECS_PI_SECURITY_GROUP_NAME", f"{CDK_PREFIX}SecurityGroupPiAgent"
553
+ )
554
+ ECS_PI_LOG_GROUP_NAME = get_or_create_env_var(
555
+ "ECS_PI_LOG_GROUP_NAME", f"/ecs/{ECS_PI_SERVICE_NAME}-logs".lower()
556
+ )
557
+ ECS_PI_TASK_CPU_SIZE = get_or_create_env_var("ECS_PI_TASK_CPU_SIZE", "1024")
558
+ ECS_PI_TASK_MEMORY_SIZE = get_or_create_env_var("ECS_PI_TASK_MEMORY_SIZE", "2048")
559
+ PI_GRADIO_PORT = get_or_create_env_var("PI_GRADIO_PORT", "7862")
560
+ PI_ALB_HOST_HEADER = get_or_create_env_var("PI_ALB_HOST_HEADER", "")
561
+ PI_ALB_TARGET_GROUP_NAME = get_or_create_env_var(
562
+ "PI_ALB_TARGET_GROUP_NAME", f"{CDK_PREFIX}PiAgentTG"[-32:]
563
+ )
564
+ PI_ALB_LISTENER_RULE_PRIORITY = int(
565
+ get_or_create_env_var("PI_ALB_LISTENER_RULE_PRIORITY", "1")
566
+ )
567
+ PI_AGENT_ENV_S3_KEY = get_or_create_env_var("PI_AGENT_ENV_S3_KEY", "pi_agent.env")
568
+
569
+ if ENABLE_PI_AGENT_ECS_SERVICE == "True" and USE_ECS_EXPRESS_MODE == "True":
570
+ raise ValueError(
571
+ "ENABLE_PI_AGENT_ECS_SERVICE=True requires legacy Fargate (USE_ECS_EXPRESS_MODE=False)."
572
+ )
573
+ if ENABLE_PI_AGENT_ECS_SERVICE == "True" and ENABLE_ECS_SERVICE_CONNECT != "True":
574
+ raise ValueError(
575
+ "ENABLE_PI_AGENT_ECS_SERVICE=True requires ENABLE_ECS_SERVICE_CONNECT=True "
576
+ "so the Pi task can reach the main app at http://<discovery>:7860."
577
+ )
578
+ if ENABLE_PI_AGENT_ECS_SERVICE == "True" and not PI_ALB_HOST_HEADER.strip():
579
+ raise ValueError(
580
+ "ENABLE_PI_AGENT_ECS_SERVICE=True requires PI_ALB_HOST_HEADER "
581
+ "(host-header rule on the shared ALB, e.g. pi.redaction.example.com)."
582
+ )
583
+
584
+ ###
585
+ # WHOLE DOCUMENT API OPTIONS
586
+ ###
587
+
588
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var(
589
+ "DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS", "7"
590
+ ) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
cdk/cdk_functions.py ADDED
@@ -0,0 +1,2448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ipaddress
2
+ import json
3
+ import os
4
+ from typing import Any, Dict, FrozenSet, List, Optional, Tuple, Union
5
+
6
+ import boto3
7
+ import pandas as pd
8
+ from aws_cdk import App, CfnOutput, CfnTag, Duration, Fn, RemovalPolicy, Tags
9
+ from aws_cdk import aws_cognito as cognito
10
+ from aws_cdk import aws_ec2 as ec2
11
+ from aws_cdk import aws_ecs as ecs
12
+ from aws_cdk import aws_elasticloadbalancingv2 as elb
13
+ from aws_cdk import aws_elasticloadbalancingv2_actions as elb_act
14
+ from aws_cdk import aws_iam as iam
15
+ from aws_cdk import aws_lambda as lambda_
16
+ from aws_cdk import aws_logs as logs
17
+ from aws_cdk import aws_s3 as s3
18
+ from aws_cdk import aws_s3_notifications as s3n
19
+ from aws_cdk import aws_secretsmanager as secretsmanager
20
+ from aws_cdk import aws_wafv2 as wafv2
21
+ from aws_cdk import custom_resources as cr
22
+ from botocore.exceptions import ClientError, NoCredentialsError
23
+ from cdk_config import (
24
+ ACCESS_LOG_DYNAMODB_TABLE_NAME,
25
+ AWS_REGION,
26
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
27
+ NAT_GATEWAY_EIP_NAME,
28
+ POLICY_FILE_LOCATIONS,
29
+ PRIVATE_SUBNET_AVAILABILITY_ZONES,
30
+ PRIVATE_SUBNET_CIDR_BLOCKS,
31
+ PRIVATE_SUBNETS_TO_USE,
32
+ PUBLIC_SUBNET_AVAILABILITY_ZONES,
33
+ PUBLIC_SUBNET_CIDR_BLOCKS,
34
+ PUBLIC_SUBNETS_TO_USE,
35
+ S3_LOG_CONFIG_BUCKET_NAME,
36
+ S3_OUTPUT_BUCKET_NAME,
37
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
38
+ )
39
+ from constructs import Construct
40
+ from dotenv import dotenv_values, set_key
41
+
42
# Key prefixes under which the CDK CLI caches lookup-provider results in cdk.context.json.
_CDK_LOOKUP_CONTEXT_PREFIXES = (
    "vpc-provider:",
    "load-balancer:",
    "availability-zones:",
    "hosted-zone:",
    "security-group:",
    "key-provider:",
    "ami:",
)


def purge_cdk_lookup_context(file_path: str) -> int:
    """Strip cached CDK lookup entries from a JSON context file.

    Every top-level key that starts with one of ``_CDK_LOOKUP_CONTEXT_PREFIXES``
    is dropped. The file is only rewritten (indent=2) when something was removed.

    Args:
        file_path: Path to the JSON context file (e.g. cdk.context.json).

    Returns:
        The number of keys removed; 0 when the file does not exist or
        contains no lookup keys.
    """
    if not os.path.exists(file_path):
        return 0

    with open(file_path, "r", encoding="utf-8") as handle:
        stored_context = json.load(handle)

    kept_entries = {}
    for name, payload in stored_context.items():
        # str.startswith accepts a tuple, so one call tests every prefix.
        if name.startswith(_CDK_LOOKUP_CONTEXT_PREFIXES):
            continue
        kept_entries[name] = payload

    removed = len(stored_context) - len(kept_entries)
    if not removed:
        return removed

    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(kept_entries, handle, indent=2)
    print(f"Removed {removed} stale CDK lookup context key(s) from {file_path}.")
    return removed
71
+
72
+
73
def log_aws_credential_context(
    expected_account_id: Optional[str] = None,
    expected_region: Optional[str] = None,
) -> Dict[str, Any]:
    """Print a non-secret summary of the AWS identity boto3 resolves, for CDK debugging.

    Reports the profile/region environment variables, whether credential
    environment variables are set (never their values), a guess at the
    credential type based on the access-key prefix, and the result of
    ``sts:GetCallerIdentity``.

    Args:
        expected_account_id: Account the deployment is configured to target;
            when given it is compared with the caller's account.
        expected_region: Region the deployment is configured to target; also
            used for the STS client when the session has no region.

    Returns:
        A dict summarising what was found. It always holds ``profile`` and
        ``session_profile``; depending on how far the checks got it may also
        hold ``access_key_prefix``, ``inferred_credential_type``, ``account``,
        ``arn``, ``user_id``, ``principal_kind``, ``account_mismatch`` or
        ``error``.
    """
    end_banner = "--- End AWS credential context ---\n"
    no_profile_label = "(not set — using default profile chain)"

    def _set_or_not(flag: bool) -> str:
        # Render presence only; the variable's value is never printed.
        return "set" if flag else "not set"

    env = os.environ
    profile = env.get("AWS_PROFILE") or no_profile_label
    default_region = (
        env.get("AWS_REGION")
        or env.get("AWS_DEFAULT_REGION")
        or "(not set in environment)"
    )
    env_access_key_set = bool(env.get("AWS_ACCESS_KEY_ID"))
    env_secret_key_set = bool(env.get("AWS_SECRET_ACCESS_KEY"))
    env_session_token_set = bool(env.get("AWS_SESSION_TOKEN"))

    print("\n--- AWS credential context (CDK / boto3) ---")
    print(f"AWS_PROFILE: {profile}")
    print(f"AWS_REGION / AWS_DEFAULT_REGION (env): {default_region}")
    print(
        "Environment credential variables: "
        f"AWS_ACCESS_KEY_ID={_set_or_not(env_access_key_set)}, "
        f"AWS_SECRET_ACCESS_KEY={_set_or_not(env_secret_key_set)}, "
        f"AWS_SESSION_TOKEN={_set_or_not(env_session_token_set)}"
    )
    if expected_account_id:
        print(f"Configured CDK target account (AWS_ACCOUNT_ID): {expected_account_id}")
    if expected_region:
        print(f"Configured CDK target region (AWS_REGION): {expected_region}")

    session = boto3.Session()
    active_profile = session.profile_name or "(default)"
    print(f"boto3 session profile: {active_profile}")
    print(f"boto3 session region: {session.region_name or '(not set)'}")

    credentials = session.get_credentials()
    summary: Dict[str, Any] = {
        "profile": profile,
        "session_profile": active_profile,
    }

    if credentials is None:
        print("WARNING: No AWS credentials found in the default provider chain.")
        print(end_banner)
        summary["error"] = "no_credentials"
        return summary

    access_key = credentials.get_frozen_credentials().access_key or ""
    # Only the first four characters are exposed.
    if len(access_key) >= 4:
        summary["access_key_prefix"] = access_key[:4] + "..."
    else:
        summary["access_key_prefix"] = "(none)"

    # Classify the credential source: env vars win, otherwise go by key prefix.
    if env_access_key_set:
        credential_source = "environment variables (highest precedence)"
    elif access_key.startswith("AKIA"):
        credential_source = "long-lived access key (likely ~/.aws/credentials [default] or named profile)"
    elif access_key.startswith("ASIA"):
        credential_source = "temporary credentials (SSO, assumed role, or STS session)"
    else:
        credential_source = (
            "resolved credentials (source could not be classified from key prefix)"
        )

    print(f"Inferred credential type: {credential_source}")
    summary["inferred_credential_type"] = credential_source

    if env_access_key_set and profile != no_profile_label:
        print(
            "NOTE: AWS_ACCESS_KEY_ID is set in the environment, so it overrides "
            f"profile '{profile}' and SSO."
        )

    try:
        sts_client = session.client(
            "sts", region_name=session.region_name or expected_region
        )
        identity = sts_client.get_caller_identity()
    except (ClientError, NoCredentialsError) as exc:
        print(f"WARNING: sts:GetCallerIdentity failed: {exc}")
        print(end_banner)
        summary["error"] = str(exc)
        return summary

    account = identity.get("Account", "")
    arn = identity.get("Arn", "")
    user_id = identity.get("UserId", "")

    print(f"Caller account: {account}")
    print(f"Caller ARN: {arn}")
    print(f"Caller UserId: {user_id}")

    # First ARN marker that matches decides the principal kind.
    arn_markers = (
        (":assumed-role/", "assumed IAM role (typical for SSO or role chaining)"),
        (":user/", "IAM user (typical for static access keys in credentials file)"),
        (":federated-user/", "federated user"),
    )
    for marker, label in arn_markers:
        if marker in arn:
            principal_kind = label
            break
    else:
        principal_kind = "other IAM principal"

    print(f"Principal kind: {principal_kind}")
    summary.update(
        {
            "account": account,
            "arn": arn,
            "user_id": user_id,
            "principal_kind": principal_kind,
        }
    )

    if expected_account_id:
        expected_account = str(expected_account_id)
        if account and account != expected_account:
            print(
                "WARNING: Caller account does not match configured AWS_ACCOUNT_ID. "
                "CDK will target the configured account but act as this identity — "
                "deployments and lookups may fail. Set AWS_PROFILE to your SSO profile "
                "and unset AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY if needed."
            )
            summary["account_mismatch"] = True
        elif account == expected_account:
            print("Caller account matches configured AWS_ACCOUNT_ID.")

    if profile == no_profile_label:
        print(
            "TIP: Set AWS_PROFILE to your SSO profile name so Python and the CDK CLI "
            "(Node) use the same session. Example: "
            '$env:AWS_PROFILE = "YourSsoProfileName"'
        )

    print(end_banner)
    return summary
205
+
206
+
207
+ # --- Function to load context from file ---
208
def load_context_from_file(app: App, file_path: str):
    """Copy every top-level key of a JSON context file into the CDK app's context.

    Args:
        app: The CDK App whose node receives the context values.
        file_path: Path to a JSON file holding a mapping of context keys to values.

    A missing file is reported on stdout and otherwise ignored.
    """
    if not os.path.exists(file_path):
        print(f"Context file not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as handle:
        stored_context = json.load(handle)

    for name, payload in stored_context.items():
        app.node.set_context(name, payload)
    print(f"Loaded context from {file_path}")
217
+
218
+
219
+ # --- Helper to parse environment variables into lists ---
220
+ def _get_env_list(env_var_name: str) -> List[str]:
221
+ """Parses a comma-separated environment variable into a list of strings."""
222
+ value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
223
+ if not value:
224
+ return []
225
+ # Split by comma and filter out any empty strings that might result from extra commas
226
+ return [s.strip() for s in value.split(",") if s.strip()]
227
+
228
+
229
# Normalise the list-valued settings imported from cdk_config.
# Each setting is parsed only when it is set (truthy). _get_env_list expects the
# setting's VALUE, so the variable itself is passed -- passing the quoted name
# would parse the name and yield garbage such as ["UBLIC_SUBNET_CIDR_BLOCK"].
if PUBLIC_SUBNETS_TO_USE:
    PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
if PRIVATE_SUBNETS_TO_USE:
    PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)

if PUBLIC_SUBNET_CIDR_BLOCKS:
    PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
if PUBLIC_SUBNET_AVAILABILITY_ZONES:
    PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
if PRIVATE_SUBNET_CIDR_BLOCKS:
    PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
if PRIVATE_SUBNET_AVAILABILITY_ZONES:
    PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(
        PRIVATE_SUBNET_AVAILABILITY_ZONES
    )

if POLICY_FILE_LOCATIONS:
    POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
248
+
249
+
250
def check_for_existing_role(role_name: str):
    """Look up an IAM role by name with boto3.

    Args:
        role_name: Name of the IAM role to look for.

    Returns:
        A 3-tuple ``(exists, role_arn, "")``: ``(True, <arn>, "")`` when the
        role is found, ``(False, "", "")`` when IAM reports NoSuchEntity.

    Raises:
        Exception: For any other failure while fetching the role; the
            underlying error is attached as the cause.
    """
    # Named iam_client (not ``iam``) so the module-level aws_cdk ``iam`` import is
    # not shadowed, and created before the ``try`` so the ``except`` clause below
    # can always resolve ``iam_client.exceptions``.
    iam_client = boto3.client("iam")
    try:
        response = iam_client.get_role(RoleName=role_name)
        role = response["Role"]["Arn"]

        print("Response Role:", role)

        return True, role, ""
    except iam_client.exceptions.NoSuchEntityException:
        return False, "", ""
    except Exception as e:
        raise Exception("Getting information on IAM role failed due to:", e) from e
265
+
266
+
267
+ from typing import List
268
+
269
+ # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
270
+ # For example:
271
+ # POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
272
+
273
+
274
def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
    """Attach each statement of a parsed IAM policy document to a CDK role.

    Args:
        role: The CDK role that receives the statements.
        policy_document: A parsed IAM policy document; its "Statement" entry
            must be a list of statement dictionaries.

    A document without a "Statement" list is skipped with a warning. A statement
    that cannot be converted or attached is reported and skipped; the remaining
    statements are still processed. Nothing is returned.
    """
    has_statement_list = "Statement" in policy_document and isinstance(
        policy_document["Statement"], list
    )
    if not has_statement_list:
        print("Warning: Policy document does not contain a 'Statement' list. Skipping.")
        return

    for entry in policy_document["Statement"]:
        try:
            # Convert the raw dict into a CDK PolicyStatement, then attach it.
            role.add_to_policy(iam.PolicyStatement.from_json(entry))
            print(f" - Added statement: {entry.get('Sid', 'No Sid')}")
        except Exception as err:
            print(
                f"Warning: Could not process policy statement: {entry}. Error: {err}"
            )
301
+
302
+
303
def add_s3_enforce_ssl_policy(bucket: s3.IBucket) -> None:
    """Add a bucket-policy statement denying S3 requests made without TLS.

    The statement denies every S3 action, for any principal, on the bucket and
    all of its objects whenever ``aws:SecureTransport`` is "false"
    (the Security Hub S3.5 control).

    Args:
        bucket: The bucket whose resource policy is extended.
    """
    bucket_and_objects = [bucket.bucket_arn, f"{bucket.bucket_arn}/*"]
    deny_insecure_transport = iam.PolicyStatement(
        effect=iam.Effect.DENY,
        principals=[iam.AnyPrincipal()],
        actions=["s3:*"],
        resources=bucket_and_objects,
        conditions={"Bool": {"aws:SecureTransport": "false"}},
    )
    bucket.add_to_resource_policy(deny_insecure_transport)
314
+
315
+
316
def add_custom_policies(
    scope: Construct,  # Not used here; kept so the signature can grow to ManagedPolicies
    role: iam.IRole,
    policy_file_locations: Optional[List[str]] = None,
    custom_policy_text: Optional[str] = None,
) -> iam.IRole:
    """
    Loads custom policies from JSON files and/or a JSON string and attaches them to a CDK Role.

    Processing is best-effort: a missing file, invalid JSON or any other error
    for one source is printed and that source is skipped, so one bad policy
    does not prevent the others from being attached.

    Args:
        scope: The scope in which to define constructs (currently unused).
        role: The CDK Role construct to attach policies to.
        policy_file_locations: List of file paths to JSON policy documents.
        custom_policy_text: A JSON string representing a policy document.

    Returns:
        The same CDK Role construct that was passed in.
    """
    if policy_file_locations is None:
        policy_file_locations = []

    current_source = "unknown source"  # For error messages

    try:
        if policy_file_locations:
            print(f"Attempting to add policies from files to role {role.node.id}...")
            for path in policy_file_locations:
                current_source = f"file: {path}"
                try:
                    # Read as UTF-8 explicitly, like the other JSON reads in this
                    # file, instead of the platform-dependent default encoding.
                    with open(path, "r", encoding="utf-8") as f:
                        policy_document = json.load(f)
                    print(f"Processing policy from {current_source}...")
                    add_statement_to_policy(role, policy_document)
                except FileNotFoundError:
                    print(f"Warning: Policy file not found at {path}. Skipping.")
                except json.JSONDecodeError as e:
                    print(
                        f"Warning: Invalid JSON in policy file {path}: {e}. Skipping."
                    )
                except Exception as e:
                    print(
                        f"An unexpected error occurred processing policy from {path}: {e}. Skipping."
                    )

        if custom_policy_text:
            current_source = "custom policy text string"
            print(
                f"Attempting to add policy from custom text to role {role.node.id}..."
            )
            try:
                # The text is a JSON string; parse it into a dictionary first.
                policy_document = json.loads(custom_policy_text)
                print(f"Processing policy from {current_source}...")
                add_statement_to_policy(role, policy_document)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
            except Exception as e:
                print(
                    f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping."
                )

        print(f"Finished processing custom policies for role {role.node.id}.")

    except Exception as e:
        # Last-resort guard (e.g. a failure reading role.node.id); report and
        # still hand the role back to the caller.
        print(
            f"An unhandled error occurred during policy addition for {current_source}: {e}"
        )

    return role
386
+
387
+
388
+ # Import the S3 Bucket class if you intend to return a CDK object later
389
+ # from aws_cdk import aws_s3 as s3
390
+
391
+
392
def check_s3_bucket_exists(
    bucket_name: str,
):
    """
    Checks whether an S3 bucket exists and is accessible, using ``head_bucket``.

    Args:
        bucket_name: The name of the S3 bucket to check.

    Returns:
        A 2-tuple ``(exists, name)``:
          - ``(True, bucket_name)`` when ``head_bucket`` succeeds.
          - ``(False, None)`` when the error code is "404".
          - ``(False, bucket_name)`` when the error code is "403". A 403 is
            ambiguous (the original author observed it for buckets that do
            not exist as well), so it is reported as "does not exist".

    Raises:
        ClientError: For any other AWS error code (re-raised after logging).
        Exception: Any non-ClientError failure (re-raised after logging).
    """
    client = boto3.client("s3")
    try:
        client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' exists and is accessible.")
        return True, bucket_name

    except ClientError as err:
        code = err.response["Error"]["Code"]

        if code == "404":
            print(f"Bucket '{bucket_name}' does not exist.")
            return False, None

        if code == "403":
            print(
                f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case."
            )
            return False, bucket_name

        # Anything else is unexpected: log it and let the caller see it.
        print(
            f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {err}"
        )
        raise
    except Exception as err:
        print(
            f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {err}"
        )
        raise
447
+
448
+
449
+ # Example usage in your check_resources.py:
450
+ # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
451
+ # context_data[f"exists:{log_bucket_name}"] = exists
452
+ # # You don't necessarily need to store the name in context if using from_bucket_name
453
+
454
+
455
+ # Delete an S3 bucket
456
+ def delete_s3_bucket(bucket_name: str):
457
+ s3 = boto3.client("s3")
458
+
459
+ try:
460
+ # List and delete all objects
461
+ response = s3.list_object_versions(Bucket=bucket_name)
462
+ versions = response.get("Versions", []) + response.get("DeleteMarkers", [])
463
+ for version in versions:
464
+ s3.delete_object(
465
+ Bucket=bucket_name, Key=version["Key"], VersionId=version["VersionId"]
466
+ )
467
+
468
+ # Delete the bucket
469
+ s3.delete_bucket(Bucket=bucket_name)
470
+ return {"Status": "SUCCESS"}
471
+ except Exception as e:
472
+ return {"Status": "FAILED", "Reason": str(e)}
473
+
474
+
475
+ # Function to get subnet ID from subnet name
476
def get_subnet_id(vpc: Any, ec2_client: Any, subnet_name: str) -> Optional[str]:
    """Return the ID of the subnet in a VPC whose 'Name' tag equals ``subnet_name``.

    Args:
        vpc: Object exposing the VPC ID as ``vpc.vpc_id``.
        ec2_client: Client exposing ``describe_subnets`` (e.g. a boto3 EC2 client).
        subnet_name: Value of the 'Name' tag to look for.

    Returns:
        The "SubnetId" of the first matching subnet, or None when no subnet in
        the VPC carries that name.
    """
    response = ec2_client.describe_subnets(
        Filters=[{"Name": "vpc-id", "Values": [vpc.vpc_id]}]
    )

    for subnet in response["Subnets"]:
        # A subnet entry without a "Tags" key (or with an empty one) simply has
        # no name; treat it as untagged instead of raising KeyError.
        tags = subnet.get("Tags") or []
        if any(tag["Key"] == "Name" and tag["Value"] == subnet_name for tag in tags):
            return subnet["SubnetId"]

    return None
489
+
490
+
491
def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
    """
    Checks if an ECR repository with the given name exists.

    Args:
        repo_name: The name of the ECR repository to check.

    Returns:
        A 2-tuple ``(exists, repository)``: ``(True, <repository description
        dict>)`` when the repository is found, otherwise ``(False, {})``.

    Raises:
        ClientError: For AWS errors other than RepositoryNotFoundException.
    """
    ecr_client = boto3.client("ecr")
    try:
        print("ecr repo_name to check:", repo_name)
        response = ecr_client.describe_repositories(repositoryNames=[repo_name])
        repositories = response["repositories"]
        # Handle an empty result explicitly rather than relying on an
        # IndexError being swallowed by the broad handler below.
        if not repositories:
            return False, {}
        return True, repositories[0]
    except ClientError as e:
        # This specific error code means the repository does not exist.
        if e.response["Error"]["Code"] == "RepositoryNotFoundException":
            return False, {}
        # Re-raise other AWS errors (e.g. permissions) for the caller.
        raise
    except Exception as e:
        # Best-effort: any non-AWS failure is reported as "not found".
        print(f"An unexpected error occurred: {e}")
        return False, {}
518
+
519
+
520
def check_codebuild_project_exists(
    project_name: str,
):
    """
    Checks if a CodeBuild project with the given name exists.

    Args:
        project_name: The name of the CodeBuild project to check.

    Returns:
        A 3-tuple ``(exists, project_arn, service_role)``:
          - ``(True, <arn>, <serviceRole or None>)`` when the project is found.
          - ``(False, None, None)`` otherwise.

    Raises:
        ClientError: Any AWS error from the lookup (re-raised after logging).
        Exception: Any other failure (re-raised after logging).
    """
    client = boto3.client("codebuild")
    try:
        # batch_get_projects answers with 'projects' (found) and
        # 'projectsNotFound' (names that could not be resolved).
        response = client.batch_get_projects(names=[project_name])

        found = response["projects"]
        if found:
            print(f"CodeBuild project '{project_name}' found.")
            first = found[0]
            return (
                True,
                first["arn"],
                first.get("serviceRole"),
            )

        missing = response["projectsNotFound"]
        if missing and project_name in missing:
            print(f"CodeBuild project '{project_name}' not found.")
        else:
            # Not listed in either place; still treated as not found.
            print(
                f"CodeBuild project '{project_name}' not found (not in 'projects' list)."
            )
        return False, None, None

    except ClientError as err:
        print(
            f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {err}"
        )
        raise
    except Exception as err:
        print(
            f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {err}"
        )
        raise
584
+
585
+
586
def get_vpc_id_by_name(
    vpc_name: str,
) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
    """
    Finds a VPC by its 'Name' tag and lists the NAT Gateways inside it.

    Args:
        vpc_name: Value of the VPC's 'Name' tag.

    Returns:
        ``(vpc_id, nat_gateways)`` for the first matching VPC, where
        ``nat_gateways`` is the "NatGateways" list from describe_nat_gateways
        (empty when there are none or when that lookup fails), or ``None``
        when no VPC carries the given name. Callers that unpack the result
        must check for ``None`` first.

    Raises:
        Exception: Any failure of the VPC lookup itself (re-raised after logging).
    """
    ec2_client = boto3.client("ec2")
    try:
        vpc_response = ec2_client.describe_vpcs(
            Filters=[{"Name": "tag:Name", "Values": [vpc_name]}]
        )
        if not (vpc_response and vpc_response["Vpcs"]):
            print(f"VPC '{vpc_name}' not found.")
            return None

        vpc_id = vpc_response["Vpcs"][0]["VpcId"]
        print(f"VPC '{vpc_name}' found with ID: {vpc_id}")

        # Look for NAT Gateways in this VPC (same client; no second one needed).
        nat_gateways = []
        try:
            nat_response = ec2_client.describe_nat_gateways(
                Filters=[
                    {"Name": "vpc-id", "Values": [vpc_id]},
                ]
            )
            nat_gateways = nat_response.get("NatGateways", [])
        except Exception as e:
            # Best-effort: the VPC ID is still useful without the NAT list.
            print(f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}")

        return vpc_id, nat_gateways
    except Exception as e:
        print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
        raise
628
+
629
+
630
+ # --- Helper to fetch all existing subnets in a VPC once ---
631
def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
    """
    Collects all subnets of a VPC in one pass, indexed for later lookups.

    Args:
        vpc_id: ID of the VPC to inspect.

    Returns:
        A dictionary with three entries:
          - "by_name": subnet 'Name' tag -> subnet info (only named subnets).
          - "by_id": subnet ID -> subnet info.
          - "cidr_networks": list of parsed ``ipaddress`` network objects.
        Each subnet info dict holds "id", "cidr", "name", "az" and
        "route_table_id" (None when no explicit association was found).

    Raises:
        Exception: Any failure while querying EC2 (re-raised after logging).
    """
    ec2_client = boto3.client("ec2")
    vpc_filter = [{"Name": "vpc-id", "Values": [vpc_id]}]
    collected = {
        "by_name": {},  # {subnet_name: subnet_info}
        "by_id": {},  # {subnet_id: subnet_info}
        "cidr_networks": [],  # parsed ipaddress network objects
    }
    try:
        # Map each explicitly associated subnet to its route table.
        subnet_to_route_table: Dict[str, str] = {}
        rt_response = ec2_client.describe_route_tables(Filters=vpc_filter)
        for table in rt_response.get("RouteTables", []):
            table_id = table["RouteTableId"]
            for link in table.get("Associations", []):
                linked_subnet = link.get("SubnetId")
                if linked_subnet:
                    subnet_to_route_table[linked_subnet] = table_id

        response = ec2_client.describe_subnets(Filters=vpc_filter)
        for entry in response.get("Subnets", []):
            subnet_id = entry["SubnetId"]
            cidr_block = entry.get("CidrBlock")

            # First 'Name' tag wins; None when the subnet is unnamed.
            name_tag = None
            for tag in entry.get("Tags", []):
                if tag["Key"] == "Name":
                    name_tag = tag["Value"]
                    break

            info = {
                "id": subnet_id,
                "cidr": cidr_block,
                "name": name_tag,
                "az": entry.get("AvailabilityZone"),
                "route_table_id": subnet_to_route_table.get(subnet_id),
            }

            if name_tag:
                collected["by_name"][name_tag] = info
            collected["by_id"][subnet_id] = info

            if not cidr_block:
                continue
            try:
                network = ipaddress.ip_network(cidr_block, strict=False)
            except ValueError:
                print(
                    f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check."
                )
            else:
                collected["cidr_networks"].append(network)

        print(
            f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'."
        )
    except Exception as err:
        print(
            f"Error describing existing subnets in VPC '{vpc_id}': {err}. Cannot perform full validation."
        )
        raise  # This data is essential; do not continue without it.

    return collected
699
+
700
+
701
+ # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
702
def validate_subnet_creation_parameters(
    vpc_id: str,
    proposed_subnets_data: List[Dict[str, str]],
    existing_aws_subnets_data: Dict[str, Any],
) -> None:
    """
    Validates proposed subnets against each other and against existing subnets.

    Works purely on pre-fetched data; no AWS calls are made here.

    Args:
        vpc_id: The ID of the VPC (used only in log and error messages).
        proposed_subnets_data: Proposed subnets, each a dict with 'name',
            'cidr' and 'az', e.g.
            ``{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}``.
        existing_aws_subnets_data: Existing subnet data with at least the
            "by_name" mapping and the "cidr_networks" list (e.g. from
            _get_existing_subnets_in_vpc).

    Raises:
        ValueError: If a proposed subnet is incomplete, repeats a name used
            earlier in the list, has an unparsable CIDR, or has a CIDR that
            overlaps another proposed CIDR or an existing subnet's CIDR.
            A name that already exists in the VPC is only reported, not raised.
    """
    if not proposed_subnets_data:
        print("No proposed subnet data provided for validation. Skipping.")
        return

    print(
        f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---"
    )

    print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data["by_name"]))

    known_names = set(existing_aws_subnets_data["by_name"].keys())
    known_networks = existing_aws_subnets_data["cidr_networks"]

    # Names and networks accepted so far from the proposed list itself.
    accepted_names = set()
    accepted_networks = []

    for i, candidate in enumerate(proposed_subnets_data):
        subnet_name = candidate.get("name")
        cidr_block_str = candidate.get("cidr")
        availability_zone = candidate.get("az")

        if not (subnet_name and cidr_block_str and availability_zone):
            raise ValueError(
                f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'."
            )

        # 1. Name must be unique within the proposed list.
        if subnet_name in accepted_names:
            raise ValueError(
                f"Proposed subnet name '{subnet_name}' is duplicated within the input list."
            )
        accepted_names.add(subnet_name)

        # 2. A name that already exists in AWS is reported but tolerated.
        if subnet_name in known_names:
            print(
                f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'."
            )

        try:
            proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
        except ValueError as e:
            raise ValueError(
                f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}"
            )

        # 3. CIDR must not overlap one accepted earlier in this list.
        batch_clash = next(
            (net for net in accepted_networks if proposed_net.overlaps(net)), None
        )
        if batch_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with another proposed CIDR '{str(batch_clash)}' "
                f"within the same batch."
            )

        # 4. CIDR must not overlap an existing subnet in the VPC.
        aws_clash = next(
            (net for net in known_networks if proposed_net.overlaps(net)), None
        )
        if aws_clash is not None:
            raise ValueError(
                f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
                f"overlaps with an existing AWS subnet CIDR '{str(aws_clash)}' "
                f"in VPC '{vpc_id}'."
            )

        # Passed every check: later candidates are compared against it too.
        accepted_networks.append(proposed_net)
        print(
            f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'."
        )

    print(
        f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---"
    )
802
+
803
+
804
+ # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
805
def check_subnet_exists_by_name(
    subnet_name: str, existing_aws_subnets_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """
    Look up a subnet by its 'Name' tag in pre-fetched subnet data.

    Args:
        subnet_name: The 'Name' tag value to search for.
        existing_aws_subnets_data: Pre-fetched subnet data. Its "by_name" entry
            maps subnet names to per-subnet dicts that carry an "id" key.

    Returns:
        (True, subnet_id) when a non-empty entry exists for the name,
        otherwise (False, None).
    """
    entry = existing_aws_subnets_data["by_name"].get(subnet_name)

    # Guard clause: a missing or empty entry counts as "not found".
    if not entry:
        print(f"Subnet '{subnet_name}' not found.")
        return False, None

    found_id = entry["id"]
    print(f"Subnet '{subnet_name}' found with ID: {found_id}")
    return True, found_id
828
+
829
+
830
def create_nat_gateway(
    scope: Construct,
    public_subnet_for_nat: ec2.ISubnet,  # Expects a proper ISubnet
    nat_gateway_name: str,
    nat_gateway_id_context_key: str,
) -> str:
    """
    Define one Elastic IP and one NAT Gateway in the given public subnet.

    No context lookup is performed here; the calling stack decides whether a
    new gateway is needed. A CfnOutput named "SingleNatGatewayIdOutput" is also
    defined so the physical ID can be copied into cdk.context.json afterwards.

    Args:
        scope: Construct in which the resources are defined. Its `stack_name`
            attribute is read to build the output's export name.
        public_subnet_for_nat: Subnet whose `subnet_id` hosts the NAT Gateway.
        nat_gateway_name: Value of the gateway's Name tag; with hyphens removed
            it also forms the construct ID.
        nat_gateway_id_context_key: Context key mentioned in the output
            description (informational only).

    Returns:
        The `ref` of the NAT Gateway resource (a token during synthesis).
    """
    host_subnet_id = public_subnet_for_nat.subnet_id
    print(
        f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{host_subnet_id}'."
    )

    # Elastic IP that the gateway will use.
    elastic_ip = ec2.CfnEIP(
        scope,
        NAT_GATEWAY_EIP_NAME,
        tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)],
    )

    # Construct ID is derived from the name with hyphens stripped.
    construct_id = nat_gateway_name.replace("-", "") + "NatGateway"
    gateway = ec2.CfnNatGateway(
        scope,
        construct_id,
        subnet_id=host_subnet_id,
        allocation_id=elastic_ip.attr_allocation_id,
        tags=[CfnTag(key="Name", value=nat_gateway_name)],
    )
    # Explicit dependency on the EIP; the subnet dependency is implied by subnet_id.
    gateway.add_dependency(elastic_ip)

    gateway_ref = gateway.ref

    # Export the ID so it can be recorded in cdk.context.json after deployment.
    CfnOutput(
        scope,
        "SingleNatGatewayIdOutput",
        value=gateway_ref,
        description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
        export_name=f"{scope.stack_name}-NatGatewayId",
    )

    print(
        f"CDK: Defined new NAT Gateway '{gateway_ref}'. Its physical ID will be available in the stack outputs after deployment."
    )
    return gateway_ref
879
+
880
+
881
def create_subnets(
    scope: Construct,
    vpc: ec2.IVpc,
    prefix: str,
    subnet_names: List[str],
    cidr_blocks: List[str],
    availability_zones: List[str],
    is_public: bool,
    internet_gateway_id: Optional[str] = None,
    single_nat_gateway_id: Optional[str] = None,
) -> Tuple[List[ec2.Subnet], List[ec2.IRouteTable]]:
    """
    Create one L2 `ec2.Subnet` per entry and give each a default route.

    Public subnets get a default route to the internet gateway; private
    subnets get a default route to the single NAT gateway. Route creation is
    best-effort: a failure is printed and the subnet is still returned.

    Args:
        scope: Construct in which the subnets are defined.
        vpc: VPC whose `vpc_id` the subnets are attached to.
        prefix: Prefix used to build each subnet's construct ID.
        subnet_names: Values for each subnet's Name tag.
        cidr_blocks: CIDR block for each subnet (same order as subnet_names).
        availability_zones: AZ for each subnet (same order as subnet_names).
        is_public: True for public subnets (IGW route, public IPs on launch).
        internet_gateway_id: Required when is_public is True.
        single_nat_gateway_id: Required when is_public is False.

    Returns:
        A tuple (subnets, route_tables): the created `ec2.Subnet` constructs
        and, in the same order, each subnet's `route_table`.

    Raises:
        ValueError: If the three lists are empty or differ in length, or the
            gateway ID needed for the chosen subnet type is missing.
    """
    if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
        raise ValueError(
            "Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length."
        )
    if is_public and not internet_gateway_id:
        raise ValueError("internet_gateway_id must be provided for public subnets.")
    if not is_public and not single_nat_gateway_id:
        raise ValueError(
            "single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway."
        )

    created_subnets: List[ec2.Subnet] = []
    created_route_tables: List[ec2.IRouteTable] = []

    subnet_type_tag = "public" if is_public else "private"

    for i, subnet_name in enumerate(subnet_names):
        logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"

        subnet = ec2.Subnet(
            scope,
            logical_id,
            vpc_id=vpc.vpc_id,
            cidr_block=cidr_blocks[i],
            availability_zone=availability_zones[i],
            map_public_ip_on_launch=is_public,
        )
        Tags.of(subnet).add("Name", subnet_name)
        Tags.of(subnet).add("Type", subnet_type_tag)

        if is_public:
            try:
                # destination_cidr_block is left at add_route's default.
                subnet.add_route(
                    "DefaultInternetRoute",
                    router_id=internet_gateway_id,
                    router_type=ec2.RouterType.GATEWAY,
                )
            except Exception as e:
                # Deliberate best-effort: report and carry on with the subnet.
                print("Could not create IGW route for public subnet due to:", e)
            else:
                # Only claim the route was added when add_route succeeded.
                print(
                    f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route."
                )
        else:
            try:
                subnet.add_route(
                    "DefaultNatRoute",
                    router_id=single_nat_gateway_id,
                    router_type=ec2.RouterType.NAT_GATEWAY,
                )
            except Exception as e:
                # Deliberate best-effort: report and carry on with the subnet.
                print("Could not create NAT gateway route for private subnet due to:", e)
            else:
                print(
                    f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route."
                )

        created_subnets.append(subnet)
        created_route_tables.append(subnet.route_table)

    return created_subnets, created_route_tables
961
+
962
+
963
def ingress_rule_exists(security_group: str, peer: str, port: str):
    """
    Report whether any entry of `security_group.connections.security_groups`
    matches the given peer (and port, when one is supplied).

    Args:
        security_group: Object exposing `.connections.security_groups`.
            NOTE(review): annotated as `str`, but attribute access implies a
            security-group-like object; confirm against callers.
        peer: Value compared with each entry's `peer` attribute.
        port: Value compared with each entry's `connection` attribute. When
            falsy, only the peer is compared.

    Returns:
        True if a matching entry exists, False otherwise.
    """
    return any(
        entry.peer == peer and (not port or entry.connection == port)
        for entry in security_group.connections.security_groups
    )
972
+
973
+
974
def check_for_existing_user_pool(user_pool_name: str):
    """
    Search Cognito for a user pool with the given name.

    All pages of ListUserPools are walked (60 pools per request), so the pool
    is found even when the account has more than 60 pools.

    Args:
        user_pool_name: The user pool name to look for.

    Returns:
        (True, pool_id, pool) when found, where `pool` is the matching entry
        from the ListUserPools response; otherwise (False, "", "").
    """
    cognito_client = boto3.client("cognito-idp")

    # NextToken is only added once the service has actually returned one.
    request_kwargs = {"MaxResults": 60}

    while True:
        page = cognito_client.list_user_pools(**request_kwargs)

        for pool in page.get("UserPools", []):
            if pool.get("Name") == user_pool_name:
                existing_user_pool_id = pool["Id"]
                print(
                    f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}"
                )
                return True, existing_user_pool_id, pool

        next_token = page.get("NextToken")
        if not next_token:
            # Last page reached without a match.
            return False, "", ""
        request_kwargs["NextToken"] = next_token
997
+
998
+
999
def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
    """
    Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.

    Args:
        user_pool_id: The ID of the Cognito User Pool.
        user_pool_client_name: The name of the User Pool Client to check for.

    Returns:
        A tuple:
        - True, client_id, client_details if the client exists.
        - False, "", {} otherwise (including when the lookup itself fails).
    """
    cognito_client = boto3.client("cognito-idp")

    # The first request must not carry a NextToken; it is added only after the
    # service returns one.
    request_kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}

    while True:
        try:
            response = cognito_client.list_user_pool_clients(**request_kwargs)
        except cognito_client.exceptions.ResourceNotFoundException:
            print(f"Error: User pool with ID '{user_pool_id}' not found.")
            return False, "", {}
        except cognito_client.exceptions.InvalidParameterException:
            print(f"Error: No app clients for '{user_pool_id}' found.")
            return False, "", {}
        except Exception as e:
            # Without a response there is nothing to inspect; stop here rather
            # than fall through to an unbound or stale `response`.
            print("Could not check User Pool clients due to:", e)
            return False, "", {}

        for client in response.get("UserPoolClients", []):
            if client.get("ClientName") == user_pool_client_name:
                print(
                    f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}"
                )
                return True, client["ClientId"], client

        next_token = response.get("NextToken")
        if not next_token:
            break
        request_kwargs["NextToken"] = next_token

    return False, "", {}
1043
+
1044
+
1045
def check_for_secret(
    secret_name: str, secret_value: Optional[dict] = None
) -> Tuple[bool, dict]:
    """
    Check whether a Secrets Manager secret with the given name exists.

    This function only reads; it never creates or modifies a secret.

    Args:
        secret_name: The name (SecretId) of the Secrets Manager secret.
        secret_value: Unused. Kept so existing callers that pass it still work.

    Returns:
        A tuple:
        - (True, response) when the secret exists, where `response` is the
          GetSecretValue response.
        - (False, {}) when the secret is not found or the lookup fails.
    """
    secretsmanager_client = boto3.client("secretsmanager")

    try:
        # Raises ResourceNotFoundException when the secret does not exist.
        existing_secret = secretsmanager_client.get_secret_value(SecretId=secret_name)
        print("Secret already exists.")
        return True, existing_secret
    except secretsmanager_client.exceptions.ResourceNotFoundException:
        print("Secret not found")
        return False, {}
    except Exception as e:
        # Any other failure is reported and treated as "not found".
        print(f"Error checking for secret: {e}")
        return False, {}
1071
+
1072
+
1073
def get_security_group_id_by_name(
    group_name: str,
    vpc_id: str,
    region_name: str = AWS_REGION,
) -> Tuple[bool, str]:
    """
    Find a security group ID from its group name inside one VPC.

    Args:
        group_name: Value for the "group-name" filter.
        vpc_id: Value for the "vpc-id" filter.
        region_name: Region for the EC2 client (defaults to AWS_REGION).

    Returns:
        (True, group_id) for the first match; (False, "") when either argument
        is empty, nothing matches, or the API call raises ClientError.
    """
    if not (group_name and vpc_id):
        return False, ""

    lookup_filters = [
        {"Name": "group-name", "Values": [group_name]},
        {"Name": "vpc-id", "Values": [vpc_id]},
    ]
    try:
        ec2_client = boto3.client("ec2", region_name=region_name)
        matches = (
            ec2_client.describe_security_groups(Filters=lookup_filters).get(
                "SecurityGroups"
            )
            or []
        )
        if not matches:
            return False, ""
        return True, matches[0]["GroupId"]
    except ClientError as e:
        print(f"Error looking up security group '{group_name}': {e}")
        return False, ""
1096
+
1097
+
1098
def resolve_service_connect_client_security_group_ids(
    explicit_ids: List[str],
    security_group_names: List[str],
    get_context_str,
) -> List[str]:
    """
    Combine explicit security group IDs with IDs looked up by name.

    Names are resolved through `get_context_str("security_group_id:{name}")`.
    The result keeps first-seen order and contains no duplicates.

    Args:
        explicit_ids: Security group IDs; each must start with "sg-".
        security_group_names: Names to resolve via the context lookup.
        get_context_str: Callable taking a context key and returning a string
            (a falsy result means "not resolved").

    Returns:
        De-duplicated list of security group IDs, explicit IDs first.

    Raises:
        ValueError: If an explicit ID does not start with "sg-", or if any
            name cannot be resolved.
    """
    collected: List[str] = []

    def _remember(group_id: str) -> None:
        # Preserve insertion order while skipping repeats.
        if group_id not in collected:
            collected.append(group_id)

    for candidate in explicit_ids:
        if not candidate.startswith("sg-"):
            raise ValueError(
                f"ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS entry '{candidate}' "
                "must be a security group ID (sg-...)."
            )
        _remember(candidate)

    unresolved: List[str] = []
    for name in security_group_names:
        looked_up = get_context_str(f"security_group_id:{name}")
        if looked_up:
            _remember(looked_up)
        else:
            unresolved.append(name)

    if not unresolved:
        return collected

    vpc_label = get_context_str("vpc_id") or "(unknown)"
    raise ValueError(
        "Could not resolve Service Connect client security group(s) in VPC "
        f"{vpc_label}: "
        + ", ".join(unresolved)
        + ". Set ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_IDS, fix "
        "ECS_SERVICE_CONNECT_CLIENT_SECURITY_GROUP_NAMES / "
        "ECS_SERVICE_CONNECT_CLIENT_CDK_PREFIXES, and re-run check_resources.py."
    )
1136
+
1137
+
1138
def check_alb_exists(
    load_balancer_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Report whether a load balancer with the given name exists.

    Args:
        load_balancer_name: Name passed to DescribeLoadBalancers.
        region_name: Region for the elbv2 client. When falsy, the client is
            created without an explicit region.

    Returns:
        (True, load_balancer) with the first entry of the response's
        "LoadBalancers" list when found; (False, {}) when the list is empty,
        the service reports "LoadBalancerNotFound", or an unexpected
        non-ClientError exception occurs.

    Raises:
        ClientError: For any service error other than "LoadBalancerNotFound".
    """
    client_kwargs = {"region_name": region_name} if region_name else {}
    elbv2_client = boto3.client("elbv2", **client_kwargs)

    try:
        found = elbv2_client.describe_load_balancers(Names=[load_balancer_name])[
            "LoadBalancers"
        ]
        return (True, found[0]) if found else (False, {})
    except ClientError as e:
        # Only the "not found" code is treated as a normal negative answer.
        if e.response["Error"]["Code"] != "LoadBalancerNotFound":
            raise
        return False, {}
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1179
+
1180
+
1181
def check_fargate_task_definition_exists(
    task_definition_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a task definition with the given name exists.

    Args:
        task_definition_name: The name or ARN of the task definition to check.
        region_name: The AWS region to check in. If None, uses the default
            session region.

    Returns:
        A tuple:
        - The first element is True if the task definition exists, False otherwise.
        - The second element is the "taskDefinition" dict from the
          DescribeTaskDefinition response if found, {} otherwise.

    Raises:
        ClientError: For service errors that do not indicate a missing task
            definition.
    """
    if region_name:
        ecs_client = boto3.client("ecs", region_name=region_name)
    else:
        ecs_client = boto3.client("ecs")
    try:
        response = ecs_client.describe_task_definition(
            taskDefinition=task_definition_name
        )
        return True, response["taskDefinition"]
    except ClientError as e:
        # botocore stores the code and message under response["Error"]; there
        # is no top-level "Message" key, so read them defensively from there.
        error = e.response.get("Error", {})
        message = error.get("Message", "") or ""
        is_missing = error.get("Code") == "ClientException" and (
            ("Task definition" in message and "does not exist" in message)
            # NOTE(review): ECS appears to answer an unknown family with
            # "Unable to describe task definition." - confirm against the API.
            or "Unable to describe task definition" in message
        )
        if is_missing:
            return False, {}
        # Re-raise other service errors.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1224
+
1225
+
1226
def check_ecs_service_exists(
    cluster_name: str, service_name: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Report whether an ECS service is returned for the given cluster and name.

    Args:
        cluster_name: The name or ARN of the ECS cluster.
        service_name: The ECS service name passed to DescribeServices.
        region_name: Region for the ECS client. When falsy, the client is
            created without an explicit region.

    Returns:
        (True, service) with the first entry of the response's "services" list
        when present; (False, {}) when the list is empty, the service reports
        ClusterNotFoundException or ServiceNotFoundException, or an unexpected
        non-ClientError exception occurs.

    Raises:
        ClientError: For any other service error.
    """
    client_kwargs = {"region_name": region_name} if region_name else {}
    ecs_client = boto3.client("ecs", **client_kwargs)

    not_found_codes = ("ClusterNotFoundException", "ServiceNotFoundException")

    try:
        described = ecs_client.describe_services(
            cluster=cluster_name, services=[service_name]
        )
        services = described["services"]
        if not services:
            return False, {}
        return True, services[0]
    except ClientError as e:
        if e.response["Error"]["Code"] in not_found_codes:
            return False, {}
        # Anything else is a genuine failure for the caller to handle.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1271
+
1272
+
1273
def check_cloudfront_distribution_exists(
    distribution_name: str, region_name: str = None
) -> tuple[bool, dict | None]:
    """
    Checks if a CloudFront distribution has the given name as one of its aliases.

    CloudFront cannot filter by name, so every page of ListDistributions is
    scanned and each distribution's alias list is compared with
    `distribution_name`.

    Args:
        distribution_name: The alias (alternate domain name) to look for.
        region_name: The AWS region for the client. If None, uses the default
            session region.

    Returns:
        A tuple:
        - The first element is True if a matching distribution exists, False otherwise.
        - The second element is the matching distribution summary (dictionary)
          from the ListDistributions response if found, None otherwise.

    Raises:
        ClientError: For service errors other than "NoSuchDistribution".
    """
    if region_name:
        cf_client = boto3.client("cloudfront", region_name=region_name)
    else:
        cf_client = boto3.client("cloudfront")
    try:
        paginator = cf_client.get_paginator("list_distributions")
        for page in paginator.paginate():
            # "Items" is absent when the account has no distributions.
            distributions = (page.get("DistributionList") or {}).get("Items") or []
            for distribution in distributions:
                # Aliases live under "Aliases" -> "Items"; "Items" is absent
                # when a distribution has no aliases.
                aliases = (distribution.get("Aliases") or {}).get("Items") or []
                if distribution_name in aliases:
                    return True, distribution
        return False, None
    except ClientError as e:
        # If the error indicates the Distribution doesn't exist, return False
        if e.response["Error"]["Code"] == "NoSuchDistribution":
            return False, None
        # Re-raise other exceptions
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, None
1320
+
1321
+
1322
def create_web_acl_with_common_rules(
    scope: Construct, web_acl_name: str, waf_scope: str = "CLOUDFRONT"
):
    """
    Define a WAFv2 Web ACL made of three AWS managed rule groups plus an
    IP rate-limit rule, and output its ARN.

    Managed groups receive priorities 1..3 in list order, each with an
    override action of `none` (the group's own actions apply). For the common
    rule set, the `SizeRestrictions_BODY` rule is overridden to allow. The
    rate-limit rule takes the next priority and blocks at a limit of 1000
    aggregated by IP.

    Args:
        scope: Construct in which the CfnWebACL and CfnOutput are defined.
        web_acl_name: Name of the Web ACL.
        waf_scope: WAF scope passed to CfnWebACL (default "CLOUDFRONT").

    Returns:
        The created `wafv2.CfnWebACL`.
    """
    managed_group_names = [
        "AWSManagedRulesCommonRuleSet",
        "AWSManagedRulesKnownBadInputsRuleSet",
        "AWSManagedRulesAmazonIpReputationList",
    ]

    def _metrics(metric_name: str):
        # Same visibility settings everywhere; only the metric name varies.
        return wafv2.CfnWebACL.VisibilityConfigProperty(
            cloud_watch_metrics_enabled=True,
            metric_name=metric_name,
            sampled_requests_enabled=True,
        )

    rules = []
    for priority, group_name in enumerate(managed_group_names, start=1):
        action_overrides = None
        if group_name == "AWSManagedRulesCommonRuleSet":
            action_overrides = [
                wafv2.CfnWebACL.RuleActionOverrideProperty(
                    name="SizeRestrictions_BODY",
                    action_to_use=wafv2.CfnWebACL.RuleActionProperty(allow={}),
                )
            ]

        group_statement = wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
            vendor_name="AWS",
            name=group_name,
            rule_action_overrides=action_overrides,
        )
        rules.append(
            wafv2.CfnWebACL.RuleProperty(
                name=group_name,
                priority=priority,
                statement=wafv2.CfnWebACL.StatementProperty(
                    managed_rule_group_statement=group_statement
                ),
                visibility_config=_metrics(group_name),
                # Managed rule groups take an override action instead of an action.
                override_action=wafv2.CfnWebACL.OverrideActionProperty(none={}),
            )
        )

    # The rate limit rule takes the first priority after the managed groups.
    rules.append(
        wafv2.CfnWebACL.RuleProperty(
            name="RateLimitRule",
            priority=len(managed_group_names) + 1,
            statement=wafv2.CfnWebACL.StatementProperty(
                rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
                    limit=1000, aggregate_key_type="IP"
                )
            ),
            visibility_config=_metrics("RateLimitRule"),
            action=wafv2.CfnWebACL.RuleActionProperty(block={}),
        )
    )

    web_acl = wafv2.CfnWebACL(
        scope,
        "WebACL",
        name=web_acl_name,
        default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
        scope=waf_scope,
        visibility_config=_metrics("webACL"),
        rules=rules,
    )

    CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)

    return web_acl
1420
+
1421
+
1422
def check_web_acl_exists(
    web_acl_name: str, scope: str, region_name: str = None
) -> tuple[bool, dict]:
    """
    Checks if a Web ACL with the given name and scope exists.

    Every page of ListWebACLs is scanned for the name; on a match the full
    Web ACL is fetched with GetWebACL, which needs the Id from the listing.

    Args:
        web_acl_name: The name of the Web ACL to check.
        scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
        region_name: The AWS region to check in. Required for REGIONAL scope.
            Ignored for CLOUDFRONT, which always uses 'us-east-1'.

    Returns:
        A tuple:
        - The first element is True if the Web ACL exists, False otherwise.
        - The second element is the "WebACL" dict from the GetWebACL response
          if found, {} otherwise.

    Raises:
        ValueError: If scope is invalid, or REGIONAL is used without a region.
        ClientError: For service errors other than a not-found code.
    """
    if scope not in ["CLOUDFRONT", "REGIONAL"]:
        raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")

    if scope == "REGIONAL" and not region_name:
        raise ValueError("Region name is required for REGIONAL scope")

    if scope == "CLOUDFRONT":
        region_name = "us-east-1"  # CloudFront scope requires us-east-1

    # After the checks above region_name is always set.
    waf_client = boto3.client("wafv2", region_name=region_name)
    try:
        list_kwargs = {"Scope": scope}
        while True:
            response = waf_client.list_web_acls(**list_kwargs)
            summaries = response.get("WebACLs") or []
            for web_acl in summaries:
                if web_acl["Name"] == web_acl_name:
                    detail = waf_client.get_web_acl(
                        Name=web_acl_name, Scope=scope, Id=web_acl["Id"]
                    )
                    return True, detail["WebACL"]

            next_marker = response.get("NextMarker")
            # Stop on an empty page as well, in case a marker is still
            # returned with the final results.
            if not summaries or not next_marker:
                return False, {}
            list_kwargs["NextMarker"] = next_marker
    except ClientError as e:
        # WAFNonexistentItemException covers an ACL deleted between the list
        # and get calls; ResourceNotFoundException is kept from the original.
        if e.response["Error"]["Code"] in (
            "ResourceNotFoundException",
            "WAFNonexistentItemException",
        ):
            return False, {}
        # Re-raise other exceptions.
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False, {}
1477
+
1478
+
1479
def add_alb_https_listener_with_cert(
    scope: Construct,
    logical_id: str,  # A unique ID for this listener construct
    alb: elb.ApplicationLoadBalancer,
    acm_certificate_arn: Optional[
        str
    ],  # Optional: If None, no HTTPS listener will be created
    default_target_group: elb.ITargetGroup,  # Mandatory: The target group to forward traffic to
    listener_port_https: int = 443,
    listener_open_to_internet: bool = False,  # Be cautious with True, ensure ALB security group restricts access
    # --- Cognito Authentication Parameters ---
    enable_cognito_auth: bool = False,
    cognito_user_pool: Optional[cognito.IUserPool] = None,
    cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
    cognito_user_pool_domain: Optional[
        str
    ] = None,  # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
    cognito_auth_scope: Optional[
        str
    ] = "openid profile email",  # Default recommended scope
    cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
    stickiness_cookie_duration=None,
    # --- End Cognito Parameters ---
) -> Optional[elb.ApplicationListener]:
    """
    Add an HTTPS listener to an ALB when a certificate ARN is supplied,
    optionally placing Cognito authentication in front of the target group.

    Args:
        scope (Construct): Scope of the calling stack (not used in the body).
        logical_id (str): Construct ID for the listener.
        alb (elb.ApplicationLoadBalancer): Load balancer that receives the listener.
        acm_certificate_arn (Optional[str]): ACM certificate ARN. When falsy,
            no listener is created and None is returned.
        default_target_group (elb.ITargetGroup): Target group traffic is forwarded to.
        listener_port_https (int): Listener port (default 443).
        listener_open_to_internet (bool): Passed as the listener's `open` flag.
        enable_cognito_auth (bool): Only the exact value True enables Cognito auth.
        cognito_user_pool (Optional[cognito.IUserPool]): Required with Cognito auth.
        cognito_user_pool_client (Optional[cognito.IUserPoolClient]): Required with Cognito auth.
        cognito_user_pool_domain (Optional[str]): Required with Cognito auth; passed
            straight through as `user_pool_domain`.
            NOTE(review): confirm the type the action expects matches what callers pass.
        cognito_auth_scope (Optional[str]): Scope passed to the Cognito action.
        cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction):
            Behaviour for unauthenticated requests.
        stickiness_cookie_duration: Passed as the Cognito action's `session_timeout`.

    Returns:
        Optional[elb.ApplicationListener]: The new listener, or None when no
        certificate ARN was provided.

    Raises:
        ValueError: If Cognito auth is enabled but the pool, client, or domain
            is missing.
    """
    # Guard clause: without a certificate there is nothing to create.
    if not acm_certificate_arn:
        print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
        return None

    listener_certificates = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
    print(
        f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}"
    )

    if enable_cognito_auth is True:
        cognito_requirements = (
            cognito_user_pool,
            cognito_user_pool_client,
            cognito_user_pool_domain,
        )
        if not all(cognito_requirements):
            raise ValueError(
                "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
            )
        print(
            f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}"
        )

        # Authenticate first, then forward to the target group.
        listener_action = elb_act.AuthenticateCognitoAction(
            next=elb.ListenerAction.forward([default_target_group]),
            user_pool=cognito_user_pool,
            user_pool_client=cognito_user_pool_client,
            user_pool_domain=cognito_user_pool_domain,
            scope=cognito_auth_scope,
            on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
            session_timeout=stickiness_cookie_duration,
        )
    else:
        listener_action = elb.ListenerAction.forward([default_target_group])
        print("Cognito authentication is NOT enabled for this listener.")

    created_listener = alb.add_listener(
        logical_id,
        port=listener_port_https,
        open=listener_open_to_internet,
        certificates=listener_certificates,
        default_action=listener_action,
    )
    print(f"ALB HTTPS listener on port {listener_port_https} defined.")

    return created_listener
1581
+
1582
+
1583
def create_ecs_express_infrastructure_role(
    scope: Construct,
    logical_id: str,
    role_name: str,
) -> iam.Role:
    """
    Create the IAM role that ECS Express Mode assumes to provision its
    infrastructure (ALB, ACM cert, and autoscaling).

    Args:
        scope: Construct in which the role is defined.
        logical_id: Construct ID for the role.
        role_name: Physical name of the IAM role.

    Returns:
        The new `iam.Role`, assumable by "ecs.amazonaws.com", with the AWS
        managed policy attached.
    """
    ecs_principal = iam.ServicePrincipal("ecs.amazonaws.com")
    infrastructure_role = iam.Role(
        scope,
        logical_id,
        role_name=role_name,
        assumed_by=ecs_principal,
    )

    # NOTE(review): confirm whether this managed policy needs a path prefix
    # (e.g. "service-role/") for from_aws_managed_policy_name to resolve it.
    express_policy = iam.ManagedPolicy.from_aws_managed_policy_name(
        "AmazonECSInfrastructureRoleforExpressGatewayServices"
    )
    infrastructure_role.add_managed_policy(express_policy)

    return infrastructure_role
1601
+
1602
+
1603
def _secret_value_from_arn(secret_arn: str, json_key: str) -> str:
    """Build a ``valueFrom`` reference to one JSON key of a secret.

    The result is ``<secret_arn>:<json_key>::`` - the two trailing fields
    after the key are deliberately left empty.
    """
    return "{}:{}::".format(secret_arn, json_key)
1605
+
1606
+
1607
# These variables reach the container through the Express `secrets` mapping,
# so they are kept out of the plain environment list (avoids duplicating or
# leaking their values).
_EXPRESS_SECRET_ENV_NAMES = frozenset(
    ("AWS_USER_POOL_ID", "AWS_CLIENT_ID", "AWS_CLIENT_SECRET")
)
1611
+
1612
+
1613
def load_app_config_env_for_express(
    config_env_path: str,
    *,
    exclude_names: Optional[FrozenSet[str]] = None,
) -> List[ecs.CfnExpressGatewayService.KeyValuePairProperty]:
    """
    Load KEY=VALUE pairs from config/config.env for Express PrimaryContainer.environment.

    Uses the same file written by create_basic_config_env() and uploaded to S3 on the
    legacy Fargate path (environmentFiles).

    Args:
        config_env_path: Path to the dotenv file; resolved to an absolute path.
        exclude_names: Variable names to leave out of the result. ``None`` (the
            default) selects ``_EXPRESS_SECRET_ENV_NAMES``; an explicitly empty
            set excludes nothing.

    Returns:
        Key/value properties sorted by variable name. An empty list (with a
        printed warning) when the file does not exist.
    """
    # Test against None rather than truthiness so that an explicit empty set
    # is honoured instead of silently falling back to the default exclusions.
    exclude = _EXPRESS_SECRET_ENV_NAMES if exclude_names is None else exclude_names
    path = os.path.abspath(config_env_path)
    if not os.path.isfile(path):
        print(
            f"Warning: app config env file not found at {path}; "
            "Express container will not receive app config environment variables."
        )
        return []

    raw = dotenv_values(path)
    # Entries with an empty name, no value (None) or an excluded name are dropped.
    environment: List[ecs.CfnExpressGatewayService.KeyValuePairProperty] = [
        ecs.CfnExpressGatewayService.KeyValuePairProperty(
            name=name,
            value=str(value),
        )
        for name, value in sorted(raw.items())
        if name and value is not None and name not in exclude
    ]
    print(
        f"Loaded {len(environment)} environment variables from {path} for ECS Express Mode."
    )
    return environment
1648
+
1649
+
1650
def build_express_gateway_primary_container(
    *,
    image_uri: str,
    container_port: int,
    log_group_name: str,
    aws_region: str,
    secret: secretsmanager.ISecret,
    environment: Optional[
        List[ecs.CfnExpressGatewayService.KeyValuePairProperty]
    ] = None,
) -> ecs.CfnExpressGatewayService.ExpressGatewayContainerProperty:
    """Assemble the primary container definition for an Express gateway service.

    Args:
        image_uri: Container image to run.
        container_port: Port the container listens on.
        log_group_name: Log group for the awslogs configuration.
        aws_region: Region for the awslogs configuration.
        secret: Secret whose JSON keys supply the three Cognito values below.
        environment: Plain environment entries; an empty list or ``None`` is
            passed through as ``None``.

    Returns:
        The container property with logging, environment and secrets set.
    """
    express = ecs.CfnExpressGatewayService
    secret_arn = secret.secret_arn

    # (environment variable seen by the container, JSON key inside the secret)
    secret_bindings = (
        ("AWS_USER_POOL_ID", "REDACTION_USER_POOL_ID"),
        ("AWS_CLIENT_ID", "REDACTION_CLIENT_ID"),
        ("AWS_CLIENT_SECRET", "REDACTION_CLIENT_SECRET"),
    )
    container_secrets = [
        express.SecretProperty(
            name=env_name,
            value_from=_secret_value_from_arn(secret_arn, json_key),
        )
        for env_name, json_key in secret_bindings
    ]
    logs_configuration = express.ExpressGatewayServiceAwsLogsConfigurationProperty(
        log_group_name=log_group_name,
        log_stream_prefix="ecs",
        region=aws_region,
    )

    return express.ExpressGatewayContainerProperty(
        image=image_uri,
        container_port=container_port,
        aws_logs_configuration=logs_configuration,
        environment=environment if environment else None,
        secrets=container_secrets,
    )
1688
+
1689
+
1690
def create_express_gateway_service(
    scope: Construct,
    logical_id: str,
    *,
    service_name: str,
    cluster_name: str,
    execution_role_arn: str,
    infrastructure_role_arn: str,
    task_role_arn: str,
    cpu: str,
    memory: str,
    health_check_path: str,
    primary_container: ecs.CfnExpressGatewayService.ExpressGatewayContainerProperty,
    subnet_ids: List[str],
    security_group_ids: List[str],
) -> ecs.CfnExpressGatewayService:
    """Define an ECS Express gateway service (L1 ``CfnExpressGatewayService``).

    A network configuration is attached only when at least one subnet ID or
    security group ID is supplied; within it, an empty list is passed as
    ``None``.

    Returns:
        The created ``CfnExpressGatewayService`` resource.
    """
    has_network_settings = bool(subnet_ids or security_group_ids)
    network_configuration = (
        ecs.CfnExpressGatewayService.ExpressGatewayServiceNetworkConfigurationProperty(
            subnets=subnet_ids if subnet_ids else None,
            security_groups=security_group_ids if security_group_ids else None,
        )
        if has_network_settings
        else None
    )

    return ecs.CfnExpressGatewayService(
        scope,
        logical_id,
        service_name=service_name,
        cluster=cluster_name,
        execution_role_arn=execution_role_arn,
        infrastructure_role_arn=infrastructure_role_arn,
        task_role_arn=task_role_arn,
        cpu=cpu,
        memory=memory,
        health_check_path=health_check_path,
        primary_container=primary_container,
        network_configuration=network_configuration,
    )
1727
+
1728
+
1729
def _forward_target_group_action(
    target_group_arn: str,
    stickiness_seconds: int,
) -> Dict[str, Any]:
    """Build a raw ELBv2 ``forward`` action (Order 2) for one target group.

    Target-group stickiness is included only when ``stickiness_seconds`` is
    greater than zero.
    """
    forward_config: Dict[str, Any] = {
        "TargetGroups": [{"TargetGroupArn": target_group_arn}],
    }
    if stickiness_seconds > 0:
        forward_config["TargetGroupStickinessConfig"] = {
            "Enabled": True,
            "DurationSeconds": stickiness_seconds,
        }
    return {
        "Type": "forward",
        "Order": 2,
        "ForwardConfig": forward_config,
    }
1746
+
1747
+
1748
def build_cognito_default_listener_actions(
    *,
    user_pool_arn: str,
    user_pool_client_id: str,
    user_pool_domain_prefix: str,
    target_group_arn: str,
    stickiness_seconds: int = 28800,
    scope: str = "openid email profile",
) -> List[Dict[str, Any]]:
    """Default actions for ELBv2 ModifyListener (authenticate-cognito + forward).

    The first action (Order 1) authenticates through Cognito, using
    ``stickiness_seconds`` as the session timeout; the second is the forward
    action produced by ``_forward_target_group_action``.
    """
    cognito_config = {
        "UserPoolArn": user_pool_arn,
        "UserPoolClientId": user_pool_client_id,
        "UserPoolDomain": user_pool_domain_prefix,
        "Scope": scope,
        "OnUnauthenticatedRequest": "authenticate",
        "SessionTimeout": stickiness_seconds,
    }
    authenticate_action = {
        "Type": "authenticate-cognito",
        "Order": 1,
        "AuthenticateCognitoConfig": cognito_config,
    }
    forward_action = _forward_target_group_action(target_group_arn, stickiness_seconds)
    return [authenticate_action, forward_action]
1773
+
1774
+
1775
def configure_express_listener_cognito_and_cloudfront(
    scope: Construct,
    logical_id_prefix: str,
    *,
    express_service: ecs.CfnExpressGatewayService,
    user_pool_arn: str,
    user_pool_client_id: str,
    user_pool_domain_prefix: str,
    use_cloudfront: bool,
    cloudfront_host_header: str,
    stickiness_seconds: int = 28800,
) -> None:
    """
    Attach Cognito auth to the Express-managed HTTPS listener and optionally add a
    CloudFront host-header rule (same pattern as the legacy HTTP listener path).

    Two custom resources issue ELBv2 SDK calls at deploy time:

    * ``modifyListener`` (on create and on update) replaces the listener's
      default actions with authenticate-cognito followed by forward.
    * ``createRule`` / ``deleteRule`` (only when ``use_cloudfront`` and
      ``cloudfront_host_header`` are both truthy) adds a priority-1 rule that
      forwards requests carrying that host header without Cognito auth.

    Args:
        scope: Construct the custom resources are created in.
        logical_id_prefix: Prefix for construct IDs and the physical resource ID.
        express_service: Express service whose managed listener is modified.
        user_pool_arn: Cognito user pool ARN.
        user_pool_client_id: Cognito app client ID.
        user_pool_domain_prefix: Cognito user pool domain prefix.
        use_cloudfront: Whether to add the CloudFront host-header rule.
        cloudfront_host_header: Host header value matched by that rule.
        stickiness_seconds: Cognito session timeout and target-group stickiness
            duration; stickiness is omitted when this is not positive.
    """
    listener_arn = express_service.get_att(
        "ECSManagedResourceArns.IngressPath.ListenerArn"
    ).to_string()
    # First target group ARN reported by the Express service.
    target_group_arn = Fn.select(
        0,
        express_service.get_att("ECSManagedResourceArns.IngressPath.TargetGroupArns"),
    )
    default_actions = build_cognito_default_listener_actions(
        user_pool_arn=user_pool_arn,
        user_pool_client_id=user_pool_client_id,
        user_pool_domain_prefix=user_pool_domain_prefix,
        target_group_arn=target_group_arn,
        stickiness_seconds=stickiness_seconds,
    )

    # Create and update perform the identical call, so define it once.
    modify_listener_call = cr.AwsSdkCall(
        service="ELBv2",
        action="modifyListener",
        parameters={
            "ListenerArn": listener_arn,
            "DefaultActions": default_actions,
        },
        physical_resource_id=cr.PhysicalResourceId.of(
            f"express-listener-cognito-{logical_id_prefix}"
        ),
    )
    modify_listener = cr.AwsCustomResource(
        scope,
        f"{logical_id_prefix}ModifyExpressListener",
        on_create=modify_listener_call,
        on_update=modify_listener_call,
        policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
            resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
        ),
    )
    # The listener only exists once the Express service has been provisioned.
    modify_listener.node.add_dependency(express_service)

    if use_cloudfront and cloudfront_host_header:
        # Reuse the shared forward-action builder so stickiness is only enabled
        # for a positive duration (a zero/negative DurationSeconds is not a
        # valid stickiness setting); this rule is the sole action, so Order 1.
        forward_only = [
            {
                **_forward_target_group_action(target_group_arn, stickiness_seconds),
                "Order": 1,
            }
        ]
        cf_rule = cr.AwsCustomResource(
            scope,
            f"{logical_id_prefix}ExpressCloudFrontHostRule",
            on_create=cr.AwsSdkCall(
                service="ELBv2",
                action="createRule",
                parameters={
                    "ListenerArn": listener_arn,
                    "Priority": 1,
                    "Conditions": [
                        {
                            "Field": "host-header",
                            "HostHeaderConfig": {"Values": [cloudfront_host_header]},
                        }
                    ],
                    "Actions": forward_only,
                },
                # The created rule's ARN becomes the physical ID so that
                # on_delete can reference it.
                physical_resource_id=cr.PhysicalResourceId.from_response(
                    "Rules[0].RuleArn"
                ),
            ),
            on_delete=cr.AwsSdkCall(
                service="ELBv2",
                action="deleteRule",
                parameters={"RuleArn": cr.PhysicalResourceId.reference()},
            ),
            policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
                resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
            ),
        )
        # Create the host rule only after the default actions are in place.
        cf_rule.node.add_dependency(modify_listener)
1881
+
1882
+
1883
def allow_express_load_balancer_to_ecs_security_group(
    scope: Construct,
    logical_id: str,
    *,
    express_service: ecs.CfnExpressGatewayService,
    ecs_security_group: ec2.ISecurityGroup,
    container_port: int,
) -> None:
    """Allow traffic from the Express-managed ALB security group to the task SG.

    Adds a TCP ingress rule on ``ecs_security_group`` for ``container_port``
    only, whose source is the first entry of the Express service's
    ``LoadBalancerSecurityGroups`` attribute.
    """
    managed_lb_security_groups = express_service.get_att(
        "ECSManagedResourceArns.IngressPath.LoadBalancerSecurityGroups"
    )
    source_group = Fn.select(0, managed_lb_security_groups)

    ec2.CfnSecurityGroupIngress(
        scope,
        logical_id,
        group_id=ecs_security_group.security_group_id,
        ip_protocol="tcp",
        from_port=container_port,
        to_port=container_port,
        source_security_group_id=source_group,
        description="Express Mode ALB to ECS tasks",
    )
1908
+
1909
+
1910
def create_s3_batch_ecs_trigger_lambda(
    scope: Construct,
    logical_id: str,
    *,
    function_name: Optional[str],
    lambda_asset_path: str,
    output_bucket: s3.IBucket,
    config_bucket: s3.IBucket,
    cluster_name: str,
    task_definition_arn: str,
    container_name: str,
    subnet_ids: List[str],
    security_group_id: str,
    execution_role: iam.IRole,
    task_role: iam.IRole,
    env_prefix: str,
    env_suffix: str,
    input_prefix: str,
    config_prefix: str,
    default_params_key: str,
    default_direct_mode_task: str = "redact",
) -> lambda_.Function:
    """
    Lambda triggered by job .env uploads on the output bucket; runs one-shot Fargate tasks.

    Creates a dedicated execution role (basic Lambda logging, ``ecs:RunTask``
    on the task definition and cluster, ``iam:PassRole`` for the two task
    roles restricted to ``ecs-tasks.amazonaws.com``, plus S3 read grants), the
    Python 3.12 function itself, and an ``OBJECT_CREATED`` notification on
    ``output_bucket`` filtered by ``env_prefix`` / ``env_suffix``.

    Returns:
        The created Lambda function.
    """
    trigger_role = iam.Role(
        scope,
        f"{logical_id}Role",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            )
        ],
    )

    role_statements = (
        iam.PolicyStatement(
            actions=["ecs:RunTask"],
            resources=[task_definition_arn],
        ),
        iam.PolicyStatement(
            actions=["ecs:RunTask"],
            resources=[
                f"arn:aws:ecs:*:*:cluster/{cluster_name}",
            ],
        ),
        # The Lambda may hand these roles to ECS tasks only.
        iam.PolicyStatement(
            actions=["iam:PassRole"],
            resources=[execution_role.role_arn, task_role.role_arn],
            conditions={
                "StringEquals": {"iam:PassedToService": "ecs-tasks.amazonaws.com"}
            },
        ),
    )
    for statement in role_statements:
        trigger_role.add_to_policy(statement)

    output_bucket.grant_read(trigger_role, f"{env_prefix}*")
    config_bucket.grant_read(trigger_role)
    if default_params_key:
        output_bucket.grant_read(trigger_role, default_params_key)

    handler_environment = {
        "OUTPUT_BUCKET": output_bucket.bucket_name,
        "CONFIG_BUCKET": config_bucket.bucket_name,
        "INPUT_PREFIX": input_prefix,
        "CONFIG_PREFIX": config_prefix,
        "ENV_PREFIX": env_prefix,
        "ENV_SUFFIX": env_suffix,
        "DEFAULT_PARAMS_KEY": default_params_key,
        "ECS_CLUSTER": cluster_name,
        "ECS_TASK_DEF": task_definition_arn,
        "SUBNETS": ",".join(subnet_ids),
        "SECURITY_GROUPS": security_group_id,
        "CONTAINER_NAME": container_name,
        "DEFAULT_DIRECT_MODE_TASK": default_direct_mode_task,
    }

    # Only pin a physical function name when the caller supplied one.
    naming: Dict[str, Any] = {"function_name": function_name} if function_name else {}

    batch_fn = lambda_.Function(
        scope,
        logical_id,
        runtime=lambda_.Runtime.PYTHON_3_12,
        handler="lambda_function.lambda_handler",
        code=lambda_.Code.from_asset(lambda_asset_path),
        role=trigger_role,
        timeout=Duration.seconds(60),
        memory_size=256,
        environment=handler_environment,
        **naming,
    )

    output_bucket.add_event_notification(
        s3.EventType.OBJECT_CREATED,
        s3n.LambdaDestination(batch_fn),
        s3.NotificationKeyFilter(prefix=env_prefix, suffix=env_suffix),
    )

    return batch_fn
2009
+
2010
+
2011
def build_pi_agent_container_environment(
    *,
    service_connect_discovery_name: str,
    main_app_port: Union[str, int],
    pi_gradio_port: Union[str, int],
) -> Dict[str, str]:
    """Inline env for Pi agent tasks (overrides image defaults; SC URL for main app).

    Both ports are coerced with ``int()`` (so a non-numeric value raises
    ``ValueError``) and rendered back to strings in the returned mapping.
    """
    main_port_number = int(main_app_port)
    gradio_port_text = str(int(pi_gradio_port))
    main_app_url = f"http://{service_connect_discovery_name}:{main_port_number}"

    env: Dict[str, str] = {
        "APP_TYPE": "pi",
        "APP_CONFIG_PATH": "/workspace/doc_redaction/config/pi_agent.env",
        "PI_DEPLOYMENT_PROFILE": "aws-ecs",
        "PI_DEFAULT_PROVIDER": "amazon-bedrock",
    }
    env["DOC_REDACTION_GRADIO_URL"] = main_app_url
    env["PI_GRADIO_PORT"] = gradio_port_text
    env["GRADIO_SERVER_PORT"] = gradio_port_text
    env.update(
        {
            "GRADIO_SERVER_NAME": "0.0.0.0",
            "PI_WORKSPACE_DIR": "/home/user/app/workspace",
            "PI_WORKDIR": "/workspace/doc_redaction",
            "PI_UPLOAD_ROOT": "/tmp/gradio",
            "PI_SESSION_DIR": "/tmp/pi-sessions",
            "RUN_FASTAPI": "False",
            "COGNITO_AUTH": "False",
        }
    )
    return env
2036
+
2037
+
2038
def create_pi_agent_ecs_resources(
    scope: Construct,
    logical_id_prefix: str,
    *,
    vpc: ec2.IVpc,
    cluster: ecs.ICluster,
    private_subnets: List[ec2.ISubnet],
    pi_ecr_image_uri: str,
    container_name: str,
    task_role: iam.IRole,
    execution_role: iam.IRole,
    config_bucket: s3.IBucket,
    pi_agent_env_s3_key: str,
    service_name: str,
    task_family: str,
    security_group_name: str,
    log_group_name: str,
    cpu: int,
    memory_mib: int,
    pi_gradio_port: int,
    service_connect_namespace: str,
    service_connect_discovery_name: str,
    main_app_port: int,
    use_fargate_spot: str,
) -> Tuple[ecs.FargateService, ec2.SecurityGroup, ecs.FargateTaskDefinition]:
    """Second Fargate service for the Pi agent (joins Service Connect namespace as a client).

    Creates, in order: a security group, a one-month-retention log group
    (destroyed with the stack), an x86_64 Linux task definition with 21 GiB of
    ephemeral storage and one shared volume, the Pi container (``:latest`` tag
    of ``pi_ecr_image_uri``), and the Fargate service. The service is created
    with ``desired_count=0``.

    ``use_fargate_spot`` is passed through as the capacity provider name.

    Returns:
        ``(service, security_group, task_definition)``.
    """
    sg = ec2.SecurityGroup(
        scope,
        f"{logical_id_prefix}SecurityGroup",
        vpc=vpc,
        security_group_name=security_group_name,
        description="Pi agent ECS tasks",
    )

    log_group = logs.LogGroup(
        scope,
        f"{logical_id_prefix}LogGroup",
        log_group_name=log_group_name,
        retention=logs.RetentionDays.ONE_MONTH,
        removal_policy=RemovalPolicy.DESTROY,
    )

    scratch_volume = ecs.Volume(name="piEphemeralVolume")
    task_definition = ecs.FargateTaskDefinition(
        scope,
        f"{logical_id_prefix}TaskDefinition",
        family=task_family,
        cpu=cpu,
        memory_limit_mib=memory_mib,
        task_role=task_role,
        execution_role=execution_role,
        runtime_platform=ecs.RuntimePlatform(
            cpu_architecture=ecs.CpuArchitecture.X86_64,
            operating_system_family=ecs.OperatingSystemFamily.LINUX,
        ),
        ephemeral_storage_gib=21,
        volumes=[scratch_volume],
    )

    # The S3-hosted env file is optional; inline environment is always set.
    env_files: List[ecs.EnvironmentFile] = (
        [ecs.EnvironmentFile.from_bucket(config_bucket, pi_agent_env_s3_key)]
        if pi_agent_env_s3_key
        else []
    )
    inline_environment = build_pi_agent_container_environment(
        service_connect_discovery_name=service_connect_discovery_name,
        main_app_port=main_app_port,
        pi_gradio_port=pi_gradio_port,
    )
    # Write the agent config first, then replace the shell with the Gradio app.
    startup_command = [
        "bash",
        "-c",
        "python3 agent-redact/pi/pi_agent_config.py && "
        "exec python3 agent-redact/pi/gradio_app.py",
    ]

    container = task_definition.add_container(
        container_name,
        image=ecs.ContainerImage.from_registry(f"{pi_ecr_image_uri}:latest"),
        logging=ecs.LogDriver.aws_logs(
            stream_prefix="ecs-pi",
            log_group=log_group,
        ),
        environment_files=env_files or None,
        environment=inline_environment,
        command=startup_command,
        essential=True,
    )

    # All writable paths are backed by the same task volume.
    writable_paths = (
        "/home/user/app/workspace",
        "/tmp/gradio",
        "/tmp/pi-sessions",
    )
    container.add_mount_points(
        *[
            ecs.MountPoint(
                source_volume=scratch_volume.name,
                container_path=mount_path,
                read_only=False,
            )
            for mount_path in writable_paths
        ]
    )

    container.add_port_mappings(
        ecs.PortMapping(
            container_port=pi_gradio_port,
            host_port=pi_gradio_port,
            name=f"port-{pi_gradio_port}",
            protocol=ecs.Protocol.TCP,
            app_protocol=ecs.AppProtocol.http,
        )
    )

    capacity_strategy = ecs.CapacityProviderStrategy(
        capacity_provider=use_fargate_spot,
        base=0,
        weight=1,
    )
    service = ecs.FargateService(
        scope,
        f"{logical_id_prefix}Service",
        service_name=service_name,
        cluster=cluster,
        task_definition=task_definition,
        security_groups=[sg],
        vpc_subnets=ec2.SubnetSelection(subnets=private_subnets),
        platform_version=ecs.FargatePlatformVersion.LATEST,
        capacity_provider_strategies=[capacity_strategy],
        min_healthy_percent=0,
        max_healthy_percent=100,
        desired_count=0,
        service_connect_configuration=ecs.ServiceConnectProps(
            namespace=service_connect_namespace,
        ),
    )

    return service, sg, task_definition
2178
+
2179
+
2180
def attach_pi_agent_to_shared_alb(
    scope: Construct,
    logical_id_prefix: str,
    *,
    vpc: ec2.IVpc,
    alb_security_group: ec2.ISecurityGroup,
    pi_security_group: ec2.SecurityGroup,
    pi_service: ecs.FargateService,
    pi_port: int,
    pi_host_header: str,
    listener_rule_priority: int,
    target_group_name: str,
    stickiness_cookie_duration: Duration,
    https_listener: Optional[elb.IApplicationListener],
    http_listener: Optional[elb.IApplicationListener],
    acm_certificate_arn: str,
    enable_cognito_auth: bool,
    cognito_user_pool: Optional[cognito.IUserPool],
    cognito_user_pool_client: Optional[cognito.IUserPoolClient],
    cognito_user_pool_domain: Optional[cognito.IUserPoolDomain],
) -> elb.ApplicationTargetGroup:
    """Register Pi on the shared legacy ALB (second target group + host-header rules).

    Steps:
      1. Allow the ALB security group to reach the Pi security group on ``pi_port``.
      2. Create an HTTP target group for ``pi_service`` (health check ``/``,
         codes 200-399).
      3. Add host-header rules for ``pi_host_header`` at ``listener_rule_priority``:
         * HTTPS listener present: forward rule on HTTPS (behind Cognito when
           auth is enabled and all Cognito objects plus the certificate ARN are
           supplied), and, if an HTTP listener and certificate ARN exist, an
           HTTP rule redirecting to HTTPS on port 443.
         * HTTP listener only: plain forward rule on HTTP.

    Returns:
        The Pi target group.
    """
    pi_security_group.add_ingress_rule(
        peer=alb_security_group,
        connection=ec2.Port.tcp(pi_port),
        description="Shared ALB to Pi agent",
    )

    pi_target_group = elb.ApplicationTargetGroup(
        scope,
        f"{logical_id_prefix}TargetGroup",
        target_group_name=target_group_name,
        port=pi_port,
        protocol=elb.ApplicationProtocol.HTTP,
        targets=[pi_service],
        stickiness_cookie_duration=stickiness_cookie_duration,
        vpc=vpc,
        health_check=elb.HealthCheck(
            path="/",
            healthy_http_codes="200-399",
        ),
    )

    plain_forward = elb.ListenerAction.forward(
        [pi_target_group],
        stickiness_duration=stickiness_cookie_duration,
    )
    # Cognito authentication is only applied on an HTTPS listener, and only
    # when every Cognito object is available.
    cognito_ready = (
        enable_cognito_auth
        and acm_certificate_arn
        and cognito_user_pool
        and cognito_user_pool_client
        and cognito_user_pool_domain
        and https_listener
    )
    if cognito_ready:
        forward_action = elb_act.AuthenticateCognitoAction(
            next=plain_forward,
            user_pool=cognito_user_pool,
            user_pool_client=cognito_user_pool_client,
            user_pool_domain=cognito_user_pool_domain,
            scope="openid profile email",
            on_unauthenticated_request=elb.UnauthenticatedAction.AUTHENTICATE,
            session_timeout=stickiness_cookie_duration,
        )
    else:
        forward_action = plain_forward

    host_conditions = [elb.ListenerCondition.host_headers([pi_host_header])]

    if https_listener:
        https_listener.add_action(
            f"{logical_id_prefix}HttpsHostRule",
            priority=listener_rule_priority,
            conditions=host_conditions,
            action=forward_action,
        )
        # Redirect HTTP to HTTPS only when the HTTPS rule above exists.
        # Without this guard, an HTTP-only ALB with a certificate ARN set would
        # receive both a forward rule and a redirect rule at the same priority
        # on the same listener, and would redirect to a listener that is absent.
        if http_listener and acm_certificate_arn:
            http_listener.add_action(
                f"{logical_id_prefix}HttpRedirectRule",
                priority=listener_rule_priority,
                conditions=[elb.ListenerCondition.host_headers([pi_host_header])],
                action=elb.ListenerAction.redirect(
                    protocol="HTTPS",
                    port="443",
                    host="#{host}",
                    path="/#{path}",
                    query="#{query}",
                ),
            )
    elif http_listener:
        http_listener.add_action(
            f"{logical_id_prefix}HttpHostRule",
            priority=listener_rule_priority,
            conditions=host_conditions,
            action=forward_action,
        )

    return pi_target_group
2279
+
2280
+
2281
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` (including parents) unless the path already exists.

    Prints which of the two cases applied; returns nothing.
    """
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # exist_ok guards against the folder appearing between the check and here.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
2290
+
2291
+
2292
def create_basic_config_env(
    out_dir: str = "config",
    S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME,
    S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME,
    ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME,
    FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
    USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME,
):
    """
    Create a basic config.env file for the user to use with their newly deployed redaction app.

    The file is written to ``<out_dir>/config.env``. An existing file is not
    truncated: each key is set individually with ``set_key`` (unquoted).

    Returns:
        The dict of variables that were written.
    """
    feature_flags = {
        "COGNITO_AUTH": "True",
        "RUN_AWS_FUNCTIONS": "True",
        "DISPLAY_FILE_NAMES_IN_LOGS": "False",
        "SESSION_OUTPUT_FOLDER": "True",
        "SAVE_LOGS_TO_DYNAMODB": "True",
        "SHOW_COSTS": "True",
        "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS": "True",
        "LOAD_PREVIOUS_TEXTRACT_JOBS_S3": "True",
    }
    resource_names = {
        "DOCUMENT_REDACTION_BUCKET": S3_LOG_CONFIG_BUCKET_NAME,
        "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET": S3_OUTPUT_BUCKET_NAME,
        "ACCESS_LOG_DYNAMODB_TABLE_NAME": ACCESS_LOG_DYNAMODB_TABLE_NAME,
        "FEEDBACK_LOG_DYNAMODB_TABLE_NAME": FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
        "USAGE_LOG_DYNAMODB_TABLE_NAME": USAGE_LOG_DYNAMODB_TABLE_NAME,
    }
    variables = {**feature_flags, **resource_names}

    ensure_folder_exists(out_dir + "/")
    env_file_path = os.path.abspath(os.path.join(out_dir, "config.env"))

    # Make sure the target file is present before the per-key writes begin.
    if not os.path.exists(env_file_path):
        with open(env_file_path, "w"):
            pass

    for key in variables:
        set_key(env_file_path, key, str(variables[key]), quote_mode="never")

    return variables
2333
+
2334
+
2335
def start_codebuild_build(PROJECT_NAME: str, AWS_REGION: str = AWS_REGION):
    """
    Start an existing Codebuild project build

    Progress, the build ID/ARN and an approximate console URL are printed.
    Failures are reported by printing as well - nothing is raised from the
    build request and nothing is returned.
    """
    codebuild = boto3.client("codebuild", region_name=AWS_REGION)

    try:
        print(f"Attempting to start build for project: {PROJECT_NAME}")

        build = codebuild.start_build(projectName=PROJECT_NAME)["build"]
        build_id = build["id"]

        print(f"Successfully started build with ID: {build_id}")
        print(f"Build ARN: {build['arn']}")
        print("Build URL (approximate - construct based on region and ID):")
        # The console path uses only the part of the ID after the last colon.
        build_suffix = build_id.split(":")[-1]
        print(
            f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_suffix}/detail"
        )
    except codebuild.exceptions.ResourceNotFoundException:
        print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
2365
+
2366
+
2367
def upload_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str,
    RUN_AWS_FUNCTIONS: str = "1",
):
    """
    Uploads one or more local files to Amazon S3.

    Args:
    - local_file_paths: Local file path(s) to upload; a single string is accepted.
    - s3_key: Key prefix; each file's base name is appended to it.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: Uploads only happen when this is the string "1".

    Returns:
    - Status message string (one line per file on the upload path); errors
      are reported in the message rather than raised.
    """
    if RUN_AWS_FUNCTIONS != "1":
        return "App not set to run AWS functions"

    per_file_messages = []
    summary = ""

    try:
        if not (s3_bucket and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        paths = (
            [local_file_paths]
            if isinstance(local_file_paths, str)
            else local_file_paths
        )

        for path in paths:
            if not s3_client:
                summary = "Could not connect to AWS."
                continue

            # A failure on one file is recorded and the loop carries on.
            try:
                file_name = os.path.basename(path)

                s3_key_full = s3_key + file_name
                print("S3 key: ", s3_key_full)

                s3_client.upload_file(path, s3_bucket, s3_key_full)
                outcome = "File " + file_name + " uploaded successfully!"
                print(outcome)
            except Exception as e:
                outcome = f"Error uploading file(s): {e}"
                print(outcome)

            per_file_messages.append(outcome)
            summary = "\n".join(per_file_messages)
    except Exception as e:
        summary = "Could not upload files to S3 due to: " + str(e)
        print(summary)

    return summary
2432
+
2433
+
2434
+ # Initialize ECS client
2435
def start_ecs_task(cluster_name, service_name):
    """Set an ECS service's desired count to 1.

    Returns:
        A dict with ``statusCode`` 200 and a confirmation ``body`` on success,
        or ``statusCode`` 500 and the error text if the update call raises.
    """
    ecs_client = boto3.client("ecs")

    try:
        ecs_client.update_service(
            cluster=cluster_name, service=service_name, desiredCount=1
        )
    except Exception as e:
        return {"statusCode": 500, "body": f"Error updating service: {str(e)}"}

    success_body = (
        f"Service {service_name} in cluster {cluster_name} has been updated to 1 task."
    )
    return {"statusCode": 200, "body": success_body}