Spaces:
Running
Running
File size: 4,757 Bytes
from pathlib import Path
# Full text of the patch script, read once at import time. The tests visible in
# this module only search this text for substrings.
# NOTE(review): the path is relative, so it resolves against the current
# working directory -- presumably the repository root; confirm how pytest is run.
PATCH_TEXT = Path("fix_streamlit_pdf_text_reinsert.py").read_text(encoding="utf-8")
# Full text of the Dockerfile, read the same way (same relative-path caveat).
DOCKERFILE_TEXT = Path("Dockerfile").read_text(encoding="utf-8")
def test_pdf_text_reinsert_helper_is_imported_and_used():
    """Verify the patch text imports and calls the PDF text reinsert helper.

    Searches ``PATCH_TEXT`` for three literal snippets, in order: the import
    of ``reinsert_pdf_text_bytes``, the start of a call to it, and the
    ``getvalue()`` read on ``pdf_text_reinsert_file``.
    """
    expected_snippets = (
        "from scrub_key_pdf_text_reinsert import reinsert_pdf_text_bytes",
        "reinsert_pdf_text_bytes(",
        "pdf_text_reinsert_file.getvalue()",
    )
    for snippet in expected_snippets:
        assert snippet in PATCH_TEXT
def test_pdf_text_reinsert_ui_labels_are_present():
    """Verify every expected UI label string occurs in ``PATCH_TEXT``.

    All labels are checked before asserting, so a failure lists every
    missing label at once instead of stopping at the first one.

    Raises:
        AssertionError: if at least one label is absent from the patch text.
    """
    expected_labels = [
        "PDF-tekst terugzetten naar TXT",
        "PDF-bestand terugzetten naar TXT",
        "Upload een PDF-bestand met placeholders",
        "Zet PDF-tekst lokaal terug",
        "Herstelde TXT-tekst uit PDF",
        "Download herstelde TXT uit PDF (.txt)",
        "Controleverslag PDF-tekst terugzetten",
    ]
    missing = [label for label in expected_labels if label not in PATCH_TEXT]
    assert not missing, f"UI labels missing from patch text: {missing}"
def test_pdf_text_reinsert_required_warnings_are_present():
    """Verify every required warning sentence occurs in ``PATCH_TEXT``.

    All warnings are checked before asserting, so a failure lists every
    missing warning at once instead of stopping at the first one.

    Raises:
        AssertionError: if at least one warning is absent from the patch text.
    """
    required_warnings = [
        "PDF-tekstextractie is niet altijd volledig",
        "Opmaak, tabellen, kolommen, headers, footers en visuele volgorde kunnen verloren gaan",
        "Deze functie maakt geen herstelde PDF",
        "De uitvoer is alleen herstelde TXT-tekst",
        # Note the typographic apostrophe in "PDF’s"; it must match the patch text exactly.
        "Scans of afbeelding-PDF’s worden niet ondersteund omdat OCR niet beschikbaar is",
        "terugzetten herstelt originele gevoelige waarden",
        "geen AI, geen cloudverwerking en geen OCR",
    ]
    missing = [text for text in required_warnings if text not in PATCH_TEXT]
    assert not missing, f"Required warnings missing from patch text: {missing}"
def test_pdf_text_reinsert_audit_fields_are_present():
    """Verify the audit field names and report labels occur in ``PATCH_TEXT``.

    All markers are checked before asserting, so a failure lists every
    missing marker at once instead of stopping at the first one.

    Raises:
        AssertionError: if at least one marker is absent from the patch text.
    """
    expected_markers = [
        # Identifier-style field names.
        "document_type",
        "extracted_text_length",
        "replacement_count",
        "item_count",
        "active_item_count",
        "excluded_item_count",
        "placeholders_not_found",
        "unknown_placeholders",
        "duplicate_placeholders",
        "validation_issues",
        "unsupported_reason",
        "local_only",
        "ai_processing",
        "cloud_processing",
        "ocr_used",
        "pdf_output",
        # Human-readable (Dutch) labels.
        "Documenttype",
        "Lengte geëxtraheerde tekst",
        "Niet-ondersteund reden",
        "Lokaal uitgevoerd",
        "AI-verwerking",
        "Cloudverwerking",
        "OCR gebruikt",
        "PDF-output",
    ]
    missing = [marker for marker in expected_markers if marker not in PATCH_TEXT]
    assert not missing, f"Audit markers missing from patch text: {missing}"
def test_pdf_text_reinsert_shows_local_no_ai_no_cloud_no_ocr_no_pdf_output():
    """Verify the five fixed "label: Ja/Nee" status lines occur in ``PATCH_TEXT``.

    All lines are checked before asserting, so a failure lists every
    missing line at once instead of stopping at the first one.

    Raises:
        AssertionError: if at least one status line is absent from the patch text.
    """
    status_lines = [
        "Lokaal uitgevoerd: Ja",
        "AI-verwerking: Nee",
        "Cloudverwerking: Nee",
        "OCR gebruikt: Nee",
        "PDF-output: Nee",
    ]
    missing = [line for line in status_lines if line not in PATCH_TEXT]
    assert not missing, f"Status lines missing from patch text: {missing}"
def test_pdf_text_reinsert_accepts_pdf_only_and_requires_key():
    """Verify the patch text has the PDF-only type filter and the key checks.

    Searches ``PATCH_TEXT`` for the ``type=["pdf"]`` argument, the two Dutch
    messages about loading a Scrub Key and uploading a PDF first, the
    ``active_pdf_text_reinsert_scrub_key`` name, and the session-state lookup
    of ``active_scrub_key``.
    """
    pdf_type_filter = 'type=["pdf"]'
    assert pdf_type_filter in PATCH_TEXT

    for message in (
        "Laad eerst een geldige Scrub Key",
        "Upload eerst een PDF-bestand met placeholders",
    ):
        assert message in PATCH_TEXT

    assert "active_pdf_text_reinsert_scrub_key" in PATCH_TEXT
    session_lookup = 'st.session_state.get("active_scrub_key", {})'
    assert session_lookup in PATCH_TEXT
def test_pdf_text_reinsert_unsupported_case_does_not_offer_successful_download():
    """Verify the patch text gates the download on the unsupported-PDF state.

    Searches ``PATCH_TEXT`` for the ``pdf_text_unsupported_reason`` name, the
    exact assignment that derives ``pdf_text_can_download``, the two Dutch
    messages about a missing text layer and unsupported scans, and the
    ``if pdf_text_can_download:`` guard.
    """
    download_gate_assignment = (
        "pdf_text_can_download = not pdf_text_validation_issues and not pdf_text_unsupported_reason"
    )
    required_fragments = (
        "pdf_text_unsupported_reason",
        download_gate_assignment,
        "Geen bruikbare tekstlaag gevonden",
        "Scans of afbeelding-PDF’s worden niet ondersteund",
        "if pdf_text_can_download:",
    )
    for fragment in required_fragments:
        assert fragment in PATCH_TEXT
def test_pdf_text_reinsert_is_inserted_before_anonymization_else_branch():
    """Verify the patch text contains the insertion marker and its pieces.

    Searches ``PATCH_TEXT`` for the start of the ``pdf_insert_marker``
    triple-quoted assignment, the ``+ pdf_text_reinsert_ui_block``
    concatenation, and a triple-quoted ``else:`` line.
    """
    required_fragments = (
        # Start of a triple-single-quoted assignment in the patch script.
        'pdf_insert_marker = \'\'\'',
        "+ pdf_text_reinsert_ui_block",
        # "else:" followed by a newline, wrapped in triple single quotes.
        "'''else:\n'''",
    )
    for fragment in required_fragments:
        assert fragment in PATCH_TEXT
def test_dockerfile_runs_pdf_text_reinsert_patch_after_existing_patch():
    """Verify the Dockerfile chains the reinsert patch after the expanders patch.

    Searches ``DOCKERFILE_TEXT`` for the exact ``&&``-joined command that
    runs ``fix_streamlit_nested_expanders.py`` first and
    ``fix_streamlit_pdf_text_reinsert.py`` second.
    """
    chained_patch_command = (
        "python fix_streamlit_nested_expanders.py && python fix_streamlit_pdf_text_reinsert.py"
    )
    assert chained_patch_command in DOCKERFILE_TEXT
def test_dockerfile_installs_runtime_pdf_parser_for_approved_ui_path():
    """Verify the Dockerfile mentions ``pypdf`` and the poetry install command.

    Both are plain substring checks against ``DOCKERFILE_TEXT``.
    """
    for required_text in ("pypdf", "poetry install --no-root"):
        assert required_text in DOCKERFILE_TEXT
def test_no_restored_pdf_ocr_cloud_ai_or_rehydration_behavior_added():
    """Verify that none of the forbidden markers occur in the patch text.

    The first group is matched case-insensitively by searching a lowercased
    copy of ``PATCH_TEXT``. All of them are checked before asserting, so a
    failure lists every forbidden marker found instead of only the first.
    The last three snippets are matched case-sensitively against the
    original text.

    Raises:
        AssertionError: if any forbidden marker or snippet is present.
    """
    lower_patch = PATCH_TEXT.lower()
    # Written in lowercase because they are compared with the lowercased text.
    forbidden_markers = [
        "download herstelde pdf",
        "download_pdf_reinserted",
        "pdf_to_docx",
        "pytesseract",
        "ocr_used = true",
        "requests.post",
        "httpx.post",
        "cloud processing call",
        "restore_original_document",
        "automatic pdf rehydration",
        "server-side key storage",
        "durable key vault",
        "openai",
        "anthropic",
    ]
    found = [marker for marker in forbidden_markers if marker in lower_patch]
    assert not found, f"Forbidden markers present in patch text: {found}"

    # Case-sensitive checks against the unmodified patch text.
    assert "st.stop()" not in PATCH_TEXT
    assert "blocks_export = True" not in PATCH_TEXT
    assert "changes_export_semantics = True" not in PATCH_TEXT
|