File size: 4,757 Bytes
3c2ee8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb2a980
3c2ee8d
 
 
 
 
 
798fa5e
 
 
 
 
3c2ee8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from pathlib import Path


# Texts inspected by the tests below, read once when this module is imported.
# Both paths are relative, so they resolve against the current working
# directory of the test run.
_PATCH_SCRIPT_PATH = Path("fix_streamlit_pdf_text_reinsert.py")
_DOCKERFILE_PATH = Path("Dockerfile")

PATCH_TEXT = _PATCH_SCRIPT_PATH.read_text(encoding="utf-8")
DOCKERFILE_TEXT = _DOCKERFILE_PATH.read_text(encoding="utf-8")


def test_pdf_text_reinsert_helper_is_imported_and_used():
    assert "from scrub_key_pdf_text_reinsert import reinsert_pdf_text_bytes" in PATCH_TEXT
    assert "reinsert_pdf_text_bytes(" in PATCH_TEXT
    assert "pdf_text_reinsert_file.getvalue()" in PATCH_TEXT


def test_pdf_text_reinsert_ui_labels_are_present():
    for marker in [
        "PDF-tekst terugzetten naar TXT",
        "PDF-bestand terugzetten naar TXT",
        "Upload een PDF-bestand met placeholders",
        "Zet PDF-tekst lokaal terug",
        "Herstelde TXT-tekst uit PDF",
        "Download herstelde TXT uit PDF (.txt)",
        "Controleverslag PDF-tekst terugzetten",
    ]:
        assert marker in PATCH_TEXT


def test_pdf_text_reinsert_required_warnings_are_present():
    for marker in [
        "PDF-tekstextractie is niet altijd volledig",
        "Opmaak, tabellen, kolommen, headers, footers en visuele volgorde kunnen verloren gaan",
        "Deze functie maakt geen herstelde PDF",
        "De uitvoer is alleen herstelde TXT-tekst",
        "Scans of afbeelding-PDF’s worden niet ondersteund omdat OCR niet beschikbaar is",
        "terugzetten herstelt originele gevoelige waarden",
        "geen AI, geen cloudverwerking en geen OCR",
    ]:
        assert marker in PATCH_TEXT


def test_pdf_text_reinsert_audit_fields_are_present():
    for marker in [
        "document_type",
        "extracted_text_length",
        "replacement_count",
        "item_count",
        "active_item_count",
        "excluded_item_count",
        "placeholders_not_found",
        "unknown_placeholders",
        "duplicate_placeholders",
        "validation_issues",
        "unsupported_reason",
        "local_only",
        "ai_processing",
        "cloud_processing",
        "ocr_used",
        "pdf_output",
        "Documenttype",
        "Lengte geëxtraheerde tekst",
        "Niet-ondersteund reden",
        "Lokaal uitgevoerd",
        "AI-verwerking",
        "Cloudverwerking",
        "OCR gebruikt",
        "PDF-output",
    ]:
        assert marker in PATCH_TEXT


def test_pdf_text_reinsert_shows_local_no_ai_no_cloud_no_ocr_no_pdf_output():
    for marker in [
        "Lokaal uitgevoerd: Ja",
        "AI-verwerking: Nee",
        "Cloudverwerking: Nee",
        "OCR gebruikt: Nee",
        "PDF-output: Nee",
    ]:
        assert marker in PATCH_TEXT


def test_pdf_text_reinsert_accepts_pdf_only_and_requires_key():
    assert 'type=["pdf"]' in PATCH_TEXT
    assert "Laad eerst een geldige Scrub Key" in PATCH_TEXT
    assert "Upload eerst een PDF-bestand met placeholders" in PATCH_TEXT
    assert "active_pdf_text_reinsert_scrub_key" in PATCH_TEXT
    assert 'st.session_state.get("active_scrub_key", {})' in PATCH_TEXT


def test_pdf_text_reinsert_unsupported_case_does_not_offer_successful_download():
    assert "pdf_text_unsupported_reason" in PATCH_TEXT
    assert "pdf_text_can_download = not pdf_text_validation_issues and not pdf_text_unsupported_reason" in PATCH_TEXT
    assert "Geen bruikbare tekstlaag gevonden" in PATCH_TEXT
    assert "Scans of afbeelding-PDF’s worden niet ondersteund" in PATCH_TEXT
    assert "if pdf_text_can_download:" in PATCH_TEXT


def test_pdf_text_reinsert_is_inserted_before_anonymization_else_branch():
    insert_marker = 'pdf_insert_marker = \'\'\''
    assert insert_marker in PATCH_TEXT
    assert "+ pdf_text_reinsert_ui_block" in PATCH_TEXT
    assert "'''else:\n'''" in PATCH_TEXT


def test_dockerfile_runs_pdf_text_reinsert_patch_after_existing_patch():
    assert "python fix_streamlit_nested_expanders.py && python fix_streamlit_pdf_text_reinsert.py" in DOCKERFILE_TEXT


def test_dockerfile_installs_runtime_pdf_parser_for_approved_ui_path():
    assert "pypdf" in DOCKERFILE_TEXT
    assert "poetry install --no-root" in DOCKERFILE_TEXT


def test_no_restored_pdf_ocr_cloud_ai_or_rehydration_behavior_added():
    lower_patch = PATCH_TEXT.lower()
    forbidden_markers = [
        "download herstelde pdf",
        "download_pdf_reinserted",
        "pdf_to_docx",
        "pytesseract",
        "ocr_used = true",
        "requests.post",
        "httpx.post",
        "cloud processing call",
        "restore_original_document",
        "automatic pdf rehydration",
        "server-side key storage",
        "durable key vault",
        "openai",
        "anthropic",
    ]
    for marker in forbidden_markers:
        assert marker not in lower_patch
    assert "st.stop()" not in PATCH_TEXT
    assert "blocks_export = True" not in PATCH_TEXT
    assert "changes_export_semantics = True" not in PATCH_TEXT