seanpedrickcase's picture
Sync: Added Qwen3-VL-235B-A22B-Instruct to transformers models options
43bfad5
#!/usr/bin/env python3
"""
Setup script for GitHub Actions test data.
Creates dummy test files when example data is not available.
"""
import os
import sys
import pandas as pd
def create_directories():
    """Ensure the example-data folders used by the tests exist.

    Creates ``example_data`` and ``example_data/example_outputs`` relative to
    the current working directory. Folders that already exist are left as
    they are, and a confirmation line is printed for each one.
    """
    required_folders = ("example_data", "example_data/example_outputs")
    for folder in required_folders:
        # exist_ok keeps repeated runs from failing on existing folders.
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")
def _load_reportlab():
    """Import ReportLab, installing it on demand.

    Returns:
        A ``(canvas, letter)`` tuple holding the ``reportlab.pdfgen.canvas``
        module and the ``letter`` page size, or ``None`` when ReportLab can
        neither be imported nor installed.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        import importlib
        import subprocess

        try:
            # Install through the running interpreter so the package lands in
            # the environment this script imports from; a bare "pip" found on
            # PATH may belong to a different Python installation.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "reportlab"]
            )
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
            from reportlab.lib.pagesizes import letter
            from reportlab.pdfgen import canvas
        except (ImportError, OSError, subprocess.CalledProcessError) as exc:
            print(f"Could not install ReportLab: {exc}")
            return None
    return canvas, letter


def _write_pdf(canvas_module, page_size, pdf_path, pages):
    """Render ``pages`` to ``pdf_path``.

    Args:
        canvas_module: The ``reportlab.pdfgen.canvas`` module.
        page_size: Page size passed to ``Canvas`` as ``pagesize``.
        pdf_path: Destination file path.
        pages: Sequence of pages; each page is a sequence of ``(y, text)``
            pairs drawn at x=100.
    """
    pdf = canvas_module.Canvas(pdf_path, pagesize=page_size)
    for page_index, page_lines in enumerate(pages):
        if page_index:
            # Close the previous page before drawing the next one; the final
            # page is left for save() to close.
            pdf.showPage()
        for y, text in page_lines:
            pdf.drawString(100, y, text)
    pdf.save()


def _render_dummy_pdfs(canvas_module, page_size):
    """Create the three dummy PDFs with ReportLab, logging each one."""
    documents = (
        (
            "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
            (
                (
                    (750, "This is a test document for redaction testing."),
                    (700, "Email: test@example.com"),
                    (650, "Phone: 123-456-7890"),
                    (600, "Name: John Doe"),
                    (550, "Address: 123 Test Street, Test City, TC 12345"),
                ),
                (
                    (750, "Second page content"),
                    (700, "More test data: jane.doe@example.com"),
                    (650, "Another phone: 987-654-3210"),
                ),
            ),
        ),
        (
            "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
            (
                (
                    (750, "Partnership Agreement Toolkit"),
                    (700, "This is a test partnership agreement document."),
                    (650, "Contact: partnership@example.com"),
                    (600, "Phone: (555) 123-4567"),
                    (550, "Address: 123 Partnership Street, City, State 12345"),
                ),
                (
                    (750, "Page 2 - Partnership Details"),
                    (700, "More partnership information here."),
                    (650, "Contact: info@partnership.org"),
                ),
                (
                    (750, "Page 3 - Terms and Conditions"),
                    (700, "Terms and conditions content."),
                    (650, "Legal contact: legal@partnership.org"),
                ),
            ),
        ),
        (
            "example_data/graduate-job-example-cover-letter.pdf",
            (
                (
                    (750, "Cover Letter Example"),
                    (700, "Dear Hiring Manager,"),
                    (650, "I am writing to apply for the position."),
                    (600, "Contact: applicant@example.com"),
                    (550, "Phone: (555) 987-6543"),
                    (500, "Address: 456 Job Street, Employment City, EC 54321"),
                    (450, "Sincerely,"),
                    (400, "John Applicant"),
                ),
            ),
        ),
    )
    for position, (pdf_path, pages) in enumerate(documents):
        print(f"Creating PDF: {pdf_path}")
        if position == 0:
            # One-off diagnostic: confirms the output folder is in place.
            print(f"Directory exists: {os.path.exists('example_data')}")
        _write_pdf(canvas_module, page_size, pdf_path, pages)
        print(f"Created dummy PDF: {pdf_path}")


def _write_placeholder_pdfs():
    """Write plain-text stand-ins at the three PDF paths."""
    placeholders = (
        (
            "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
            "This is a dummy PDF file for testing",
        ),
        (
            "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
            "This is a dummy Partnership Agreement PDF file for testing",
        ),
        (
            "example_data/graduate-job-example-cover-letter.pdf",
            "This is a dummy cover letter PDF file for testing",
        ),
    )
    for placeholder_path, placeholder_text in placeholders:
        with open(placeholder_path, "w", encoding="utf-8") as handle:
            handle.write(placeholder_text)


def create_dummy_pdf():
    """Create dummy PDFs for testing.

    Writes three PDFs under ``example_data`` using ReportLab, installing
    ReportLab with the running interpreter's pip when it is missing. If
    ReportLab cannot be imported or installed, plain-text placeholder files
    are written at the same paths so that they at least exist.
    """
    reportlab = _load_reportlab()
    if reportlab is not None:
        try:
            _render_dummy_pdfs(*reportlab)
        except ImportError as exc:
            # An import triggered during rendering failed; fall through to
            # the placeholder files below.
            print(f"ReportLab import failed while rendering: {exc}")
        else:
            return

    print("ReportLab not available, skipping PDF creation")
    # Create simple text files instead
    _write_placeholder_pdfs()
    print("Created dummy text files instead of PDFs")
def create_dummy_csv():
    """Write the two dummy CSV inputs used by the tests.

    Produces ``example_data/combined_case_notes.csv`` (case notes with client
    and date columns) and
    ``example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv`` (text with
    page numbers). Both are written without the index column, and a
    confirmation line is printed after each file.
    """
    case_notes = pd.DataFrame(
        {
            "Case Note": [
                "Client visited for consultation regarding housing issues",
                "Follow-up appointment scheduled for next week",
                "Documentation submitted for review",
            ],
            "Client": ["John Smith", "Jane Doe", "Bob Johnson"],
            "Date": ["2024-01-15", "2024-01-16", "2024-01-17"],
        }
    )
    lambeth_pages = pd.DataFrame(
        {
            "text": [
                "Lambeth 2030 vision document content",
                "Our Future Our Lambeth strategic plan",
                "Community engagement and development",
            ],
            "page": [1, 2, 3],
        }
    )

    outputs = (
        ("example_data/combined_case_notes.csv", case_notes),
        ("example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv", lambeth_pages),
    )
    for csv_path, frame in outputs:
        frame.to_csv(csv_path, index=False)
        print(f"Created dummy CSV: {csv_path}")
def create_dummy_word_doc():
    """Write a small Word document containing fake contact details.

    The document is saved to
    ``example_data/Bold minimalist professional cover letter.docx``. If
    ``docx`` cannot be imported, a notice is printed and no file is written.
    """
    try:
        from docx import Document

        body_paragraphs = (
            "This is a test document for redaction testing.",
            "Contact Information:",
            "Email: test@example.com",
            "Phone: 123-456-7890",
            "Name: John Doe",
            "Address: 123 Test Street, Test City, TC 12345",
        )

        word_file = Document()
        word_file.add_heading("Test Document for Redaction", 0)
        for paragraph_text in body_paragraphs:
            word_file.add_paragraph(paragraph_text)

        word_file.save("example_data/Bold minimalist professional cover letter.docx")
        print("Created dummy Word document")
    except ImportError:
        print("python-docx not available, skipping Word document creation")
def create_allow_deny_lists():
    """Write the dummy allow, deny and whole-page redaction lists.

    Creates five CSV files under ``example_data``: two allow lists and two
    deny lists (each with a single ``word`` column) and one list of page
    numbers (a single ``page`` column). All are written without the index
    column, and a confirmation line is printed after each group.
    """
    # Allow lists: both files share the same terms.
    allowed_terms = {"word": ["test", "example", "document"]}
    for allow_path in (
        "example_data/test_allow_list_graduate.csv",
        "example_data/test_allow_list_partnership.csv",
    ):
        pd.DataFrame(allowed_terms).to_csv(allow_path, index=False)
    print("Created allow lists")

    # Deny lists: both files share the same terms.
    denied_terms = {"word": ["sensitive", "confidential", "private"]}
    for deny_path in (
        "example_data/partnership_toolkit_redact_custom_deny_list.csv",
        "example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
    ):
        pd.DataFrame(denied_terms).to_csv(deny_path, index=False)
    print("Created deny lists")

    # Pages to redact in full.
    pages_to_redact = pd.DataFrame({"page": [1, 2]})
    pages_to_redact.to_csv(
        "example_data/partnership_toolkit_redact_some_pages.csv", index=False
    )
    print("Created whole page redaction list")
def create_ocr_output():
    """Write a three-row dummy OCR output CSV.

    The file is saved to
    ``example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv``
    with the columns ``page``, ``text``, ``left``, ``top``, ``width``,
    ``height`` and ``line``, and without the index column.
    """
    column_names = ("page", "text", "left", "top", "width", "height", "line")
    # One tuple per OCR row, in column_names order.
    ocr_rows = (
        (1, "This is page 1 content with some text", 0.1, 0.95, 0.05, 0.01, 1),
        (2, "This is page 2 content with different text", 0.3, 0.92, 0.02, 0.02, 2),
        (3, "This is page 3 content with more text", 0.5, 0.88, 0.02, 0.02, 3),
    )
    # Transpose the rows into the column -> list-of-values mapping.
    columns = {
        name: list(values) for name, values in zip(column_names, zip(*ocr_rows))
    }

    ocr_frame = pd.DataFrame(columns)
    ocr_frame.to_csv(
        "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
        index=False,
    )
    print("Created dummy OCR output CSV")
def create_dummy_image():
    """Write a white 800x600 JPEG with fake contact details drawn on it.

    The image is saved to ``example_data/example_complaint_letter.jpg``. Two
    TrueType font files are tried in order; if neither loads, PIL's default
    font is used. If PIL cannot be imported, a notice is printed and no file
    is written.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont

        picture = Image.new("RGB", (800, 600), color="white")
        pen = ImageDraw.Draw(picture)

        # Font files tried in order; the label only feeds the error message.
        font_candidates = (
            ("DejaVuSans", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"),
            ("Arial", "/System/Library/Fonts/Arial.ttf"),
        )
        for font_label, font_file in font_candidates:
            try:
                font = ImageFont.truetype(font_file, 20)
                break
            except Exception as e:
                print(f"Error loading {font_label} font: {e}")
        else:
            # No candidate loaded: fall back to the built-in font.
            font = ImageFont.load_default()

        captions = (
            "Test Document for Redaction",
            "Email: test@example.com",
            "Phone: 123-456-7890",
            "Name: John Doe",
            "Address: 123 Test Street",
        )
        # Lines start at y=50 and are spaced 50 pixels apart.
        for y_position, caption in zip(range(50, 300, 50), captions):
            pen.text((50, y_position), caption, fill="black", font=font)

        picture.save("example_data/example_complaint_letter.jpg")
        print("Created dummy image")
    except ImportError:
        print("PIL not available, skipping image creation")
def main():
    """Generate all dummy test data, then report what was created.

    Prints the working directory and Python version, runs every creation step
    in order, lists each file found under ``example_data`` with its size, and
    finally reports whether each of the three critical PDF files exists.
    Missing critical files are only reported; no error is raised.
    """
    print("Setting up test data for GitHub Actions...")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python version: {sys.version}")

    # Directories come first because every later step writes into them.
    setup_steps = (
        create_directories,
        create_dummy_pdf,
        create_dummy_csv,
        create_dummy_word_doc,
        create_allow_deny_lists,
        create_ocr_output,
        create_dummy_image,
    )
    for run_step in setup_steps:
        run_step()

    print("\nTest data setup complete!")
    print("Created files:")
    for folder, _subfolders, filenames in os.walk("example_data"):
        for filename in filenames:
            listed_path = os.path.join(folder, filename)
            print(f" {listed_path}")
            # Verify the file exists and has content
            if not os.path.exists(listed_path):
                print(" WARNING: File does not exist!")
                continue
            print(f" Size: {os.path.getsize(listed_path)} bytes")

    # Verify critical files exist
    print("\nVerifying critical test files:")
    for required_path in (
        "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
        "example_data/graduate-job-example-cover-letter.pdf",
        "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
    ):
        if not os.path.exists(required_path):
            print(f"❌ {required_path} MISSING!")
            continue
        print(f"✅ {required_path} exists ({os.path.getsize(required_path)} bytes)")


# Run the setup only when this file is executed as a script.
if __name__ == "__main__":
    main()