#!/usr/bin/env python3
"""
Setup script for GitHub Actions test data.
Creates dummy test files when example data is not available.

All files are written below ``example_data/`` relative to the current
working directory.
"""

import importlib
import os
import subprocess
import sys

import pandas as pd

# Paths of the PDFs that main() reports on as "critical" test files.
EMAILS_PDF_PATH = (
    "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf"
)
PARTNERSHIP_PDF_PATH = "example_data/Partnership-Agreement-Toolkit_0_0.pdf"
COVER_LETTER_PDF_PATH = "example_data/graduate-job-example-cover-letter.pdf"

# Content of each dummy PDF: path -> list of pages, where each page is a list
# of (y, text) pairs. Every string is drawn at x=100 on a letter-sized page.
_PDF_PAGES = {
    EMAILS_PDF_PATH: [
        [
            (750, "This is a test document for redaction testing."),
            (700, "Email: test@example.com"),
            (650, "Phone: 123-456-7890"),
            (600, "Name: John Doe"),
            (550, "Address: 123 Test Street, Test City, TC 12345"),
        ],
        [
            (750, "Second page content"),
            (700, "More test data: jane.doe@example.com"),
            (650, "Another phone: 987-654-3210"),
        ],
    ],
    PARTNERSHIP_PDF_PATH: [
        [
            (750, "Partnership Agreement Toolkit"),
            (700, "This is a test partnership agreement document."),
            (650, "Contact: partnership@example.com"),
            (600, "Phone: (555) 123-4567"),
            (550, "Address: 123 Partnership Street, City, State 12345"),
        ],
        [
            (750, "Page 2 - Partnership Details"),
            (700, "More partnership information here."),
            (650, "Contact: info@partnership.org"),
        ],
        [
            (750, "Page 3 - Terms and Conditions"),
            (700, "Terms and conditions content."),
            (650, "Legal contact: legal@partnership.org"),
        ],
    ],
    COVER_LETTER_PDF_PATH: [
        [
            (750, "Cover Letter Example"),
            (700, "Dear Hiring Manager,"),
            (650, "I am writing to apply for the position."),
            (600, "Contact: applicant@example.com"),
            (550, "Phone: (555) 987-6543"),
            (500, "Address: 456 Job Street, Employment City, EC 54321"),
            (450, "Sincerely,"),
            (400, "John Applicant"),
        ],
    ],
}

# Plain-text stand-ins written when reportlab cannot be used.
_PDF_PLACEHOLDER_TEXT = {
    EMAILS_PDF_PATH: "This is a dummy PDF file for testing",
    PARTNERSHIP_PDF_PATH: (
        "This is a dummy Partnership Agreement PDF file for testing"
    ),
    COVER_LETTER_PDF_PATH: "This is a dummy cover letter PDF file for testing",
}

# (label used in the log message, font file path), tried in this order.
_FONT_CANDIDATES = (
    ("DejaVuSans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
    ("Arial", "/System/Library/Fonts/Arial.ttf"),
)


def create_directories() -> None:
    """Create necessary directories."""
    for dir_path in ("example_data", "example_data/example_outputs"):
        os.makedirs(dir_path, exist_ok=True)
        print(f"Created directory: {dir_path}")


def _import_reportlab(install_missing):
    """Return ``(canvas_module, letter_pagesize)`` or ``None`` if unavailable.

    When reportlab cannot be imported and ``install_missing`` is true, one
    ``pip install reportlab`` is attempted before giving up.
    """

    def _try_import():
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except ImportError:
            return None
        return canvas, letter

    loaded = _try_import()
    if loaded is not None or not install_missing:
        return loaded

    try:
        # Run pip through the current interpreter so the package lands in the
        # environment this script is actually running in.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "reportlab"]
        )
    except (subprocess.CalledProcessError, OSError) as exc:
        print(f"Could not install reportlab: {exc}")
        return None

    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
    return _try_import()


def _write_placeholder_pdfs() -> None:
    """Write short text files under the PDF file names."""
    for pdf_path, text in _PDF_PLACEHOLDER_TEXT.items():
        with open(pdf_path, "w", encoding="utf-8") as f:
            f.write(text)


def create_dummy_pdf(*, install_missing: bool = True) -> None:
    """Create dummy PDFs for testing.

    Three PDFs are drawn with reportlab. If reportlab is not importable (and
    could not be installed), text placeholders are written under the same
    file names instead, so the function does not fail for that reason.

    Args:
        install_missing: When true (the default), try to ``pip install``
            reportlab if it is not importable. Pass ``False`` to go straight
            to the placeholder fallback without touching the network.
    """
    reportlab = _import_reportlab(install_missing)
    if reportlab is None:
        print("ReportLab not available, skipping PDF creation")
        # Create simple text files instead
        _write_placeholder_pdfs()
        print("Created dummy text files instead of PDFs")
        return

    canvas, letter = reportlab
    print(f"Directory exists: {os.path.exists('example_data')}")
    for pdf_path, pages in _PDF_PAGES.items():
        print(f"Creating PDF: {pdf_path}")
        c = canvas.Canvas(pdf_path, pagesize=letter)
        for page_index, page_lines in enumerate(pages):
            if page_index:
                # Close the previous page before drawing the next one; the
                # last page is left open and finished by save().
                c.showPage()
            for y, text in page_lines:
                c.drawString(100, y, text)
        c.save()
        print(f"Created dummy PDF: {pdf_path}")


def create_dummy_csv() -> None:
    """Create dummy CSV files for testing."""
    # Main CSV
    case_notes = pd.DataFrame(
        {
            "Case Note": [
                "Client visited for consultation regarding housing issues",
                "Follow-up appointment scheduled for next week",
                "Documentation submitted for review",
            ],
            "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
            "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
        }
    )
    case_notes.to_csv("example_data/combined_case_notes.csv", index=False)
    print("Created dummy CSV: example_data/combined_case_notes.csv")

    # Lambeth CSV
    lambeth = pd.DataFrame(
        {
            "text": [
                "Lambeth 2030 vision document content",
                "Our Future Our Lambeth strategic plan",
                "Community engagement and development",
            ],
            "page": [1, 2, 3],
        }
    )
    lambeth.to_csv(
        "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", index=False
    )
    print(
        "Created dummy CSV: "
        "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
    )


def create_dummy_word_doc() -> None:
    """Create dummy Word document.

    Skipped with a message when python-docx is not installed.
    """
    try:
        from docx import Document
    except ImportError:
        print("python-docx not available, skipping Word document creation")
        return

    doc = Document()
    doc.add_heading("Test Document for Redaction", 0)
    for paragraph in (
        "This is a test document for redaction testing.",
        "Contact Information:",
        "Email: test@example.com",
        "Phone: 123-456-7890",
        "Name: John Doe",
        "Address: 123 Test Street, Test City, TC 12345",
    ):
        doc.add_paragraph(paragraph)
    doc.save("example_data/Bold minimalist professional cover letter.docx")
    print("Created dummy Word document")


def create_allow_deny_lists() -> None:
    """Create dummy allow/deny lists and the whole-page redaction list."""
    # Allow lists
    allow_df = pd.DataFrame({"word": ["test", "example", "document"]})
    for allow_path in (
        "example_data/test_allow_list_graduate.csv",
        "example_data/test_allow_list_partnership.csv",
    ):
        allow_df.to_csv(allow_path, index=False)
    print("Created allow lists")

    # Deny lists
    deny_df = pd.DataFrame({"word": ["sensitive", "confidential", "private"]})
    for deny_path in (
        "example_data/partnership_toolkit_redact_custom_deny_list.csv",
        "example_data/"
        "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        deny_df.to_csv(deny_path, index=False)
    print("Created deny lists")

    # Whole page redaction list
    pd.DataFrame({"page": [1, 2]}).to_csv(
        "example_data/partnership_toolkit_redact_some_pages.csv", index=False
    )
    print("Created whole page redaction list")


def create_ocr_output() -> None:
    """Create dummy OCR output CSV."""
    ocr_df = pd.DataFrame(
        {
            "page": [1, 2, 3],
            "text": [
                "This is page 1 content with some text",
                "This is page 2 content with different text",
                "This is page 3 content with more text",
            ],
            "left": [0.1, 0.3, 0.5],
            "top": [0.95, 0.92, 0.88],
            "width": [0.05, 0.02, 0.02],
            "height": [0.01, 0.02, 0.02],
            "line": [1, 2, 3],
        }
    )
    ocr_df.to_csv(
        "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
        index=False,
    )
    print("Created dummy OCR output CSV")


def create_dummy_image() -> None:
    """Create dummy image for testing.

    Skipped with a message when Pillow is not installed.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        print("PIL not available, skipping image creation")
        return

    img = Image.new("RGB", (800, 600), color="white")
    draw = ImageDraw.Draw(img)

    # Try to use a system font, falling back to Pillow's built-in default.
    for label, font_path in _FONT_CANDIDATES:
        try:
            font = ImageFont.truetype(font_path, 20)
        except Exception as e:
            # Deliberately broad: a missing or unreadable font must never
            # abort test-data setup.
            print(f"Error loading {label} font: {e}")
        else:
            break
    else:
        font = ImageFont.load_default()

    # Add text to image
    for y, text in (
        (50, "Test Document for Redaction"),
        (100, "Email: test@example.com"),
        (150, "Phone: 123-456-7890"),
        (200, "Name: John Doe"),
        (250, "Address: 123 Test Street"),
    ):
        draw.text((50, y), text, fill="black", font=font)

    img.save("example_data/example_complaint_letter.jpg")
    print("Created dummy image")


def main() -> None:
    """Main setup function.

    Runs every creator in turn, then lists what was written below
    ``example_data`` and reports whether the critical PDFs exist.
    """
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    create_directories()
    create_dummy_pdf()
    create_dummy_csv()
    create_dummy_word_doc()
    create_allow_deny_lists()
    create_ocr_output()
    create_dummy_image()

    print("\nTest data setup complete!")
    print("Created files:")
    for root, _dirs, files in os.walk("example_data"):
        for file in files:
            file_path = os.path.join(root, file)
            print(f" {file_path}")
            # Verify the file exists and has content
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                print(f" Size: {file_size} bytes")
            else:
                print(" WARNING: File does not exist!")

    # Verify critical files exist
    critical_files = [
        PARTNERSHIP_PDF_PATH,
        COVER_LETTER_PDF_PATH,
        EMAILS_PDF_PATH,
    ]
    print("\nVerifying critical test files:")
    for file_path in critical_files:
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f"✅ {file_path} exists ({file_size} bytes)")
        else:
            print(f"❌ {file_path} MISSING!")


if __name__ == "__main__":
    main()