File size: 3,516 Bytes
5c6e6ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from __future__ import annotations

import os

# Default PYTHONUTF8 to "1" unless the environment already defines it; this
# runs before the imports below.
# NOTE(review): the interpreter reads PYTHONUTF8 at startup, so setting it
# here presumably targets child processes rather than this one — confirm.
os.environ.setdefault("PYTHONUTF8", "1")

import numpy as np

from tools.redaction_review import export_review_page_ocr_visualisation_for_gradio


def test_export_review_page_ocr_visualisation_writes_file(tmp_path):
    """Check that exporting an OCR visualisation produces a file under tmp_path.

    The page image is a (120, 160, 3) uint8 array filled with 255 and the OCR
    payload holds one line with two words whose bounding boxes are integer
    coordinates. The returned path must resolve to an existing file inside
    ``tmp_path`` and contain ``review_ocr_visualisations``.
    """
    output_dir = str(tmp_path)

    # Build the fixture bottom-up: words -> line -> per-page OCR entry.
    hello_word = {
        "text": "Hello",
        "bounding_box": (10, 10, 60, 30),
        "conf": 95,
        "model": "Textract",
    }
    world_word = {
        "text": "world",
        "bounding_box": (70, 10, 120, 30),
        "conf": 85,
        "model": "Textract",
    }
    first_line = {
        "line": 1,
        "text": "Hello world",
        "words": [hello_word, world_word],
    }
    ocr_with_words = [{"page": 1, "results": {"line_1": first_line}}]

    blank_image = np.full((120, 160, 3), 255, dtype=np.uint8)
    page = {"image": blank_image, "boxes": []}

    out = export_review_page_ocr_visualisation_for_gradio(
        page, 1, ocr_with_words, None, "doc.pdf", output_dir
    )
    assert out is not None

    # Compare fully resolved paths so the containment check is not fooled by
    # symlinks or relative segments.
    resolved_out = os.path.realpath(out)
    safe_root = os.path.realpath(output_dir)
    assert os.path.commonpath([safe_root, resolved_out]) == safe_root
    assert os.path.isfile(resolved_out)

    # Normalise Windows separators before looking for the folder name.
    normalised_out = out.replace("\\", "/")
    assert "review_ocr_visualisations" in normalised_out


def test_export_review_page_ocr_visualisation_draws_text_for_normalized_boxes(tmp_path):
    """Check that word boxes given as fractions in [0, 1] still produce ink.

    Regression: some OCR pipelines provide bbox coords normalized to [0,1].
    The visualisation should scale these into pixel space and render text.
    The test exports a page whose two words use fractional bounding boxes,
    verifies the output file lives under ``tmp_path``, and then asserts that
    the right-hand half of the exported image contains at least one channel
    value below 250.
    """
    page = {
        "image": np.full((120, 160, 3), 255, dtype=np.uint8),
        "boxes": [],
    }
    ocr_with_words = [
        {
            "page": 1,
            "results": {
                "line_1": {
                    "line": 1,
                    "text": "Hello world",
                    "words": [
                        {
                            "text": "Hello",
                            "bounding_box": (0.10, 0.10, 0.40, 0.25),
                            "conf": 95,
                            "model": "Textract",
                        },
                        {
                            "text": "world",
                            "bounding_box": (0.45, 0.10, 0.80, 0.25),
                            "conf": 85,
                            "model": "Textract",
                        },
                    ],
                }
            },
        }
    ]

    out = export_review_page_ocr_visualisation_for_gradio(
        page,
        1,
        ocr_with_words,
        None,
        "doc.pdf",
        str(tmp_path),
    )
    assert out is not None
    resolved_out = os.path.realpath(out)
    safe_root = os.path.realpath(str(tmp_path))
    assert os.path.commonpath([safe_root, resolved_out]) == safe_root
    assert os.path.isfile(resolved_out)

    # Ensure there is non-white ink on the right-hand half (the text page).
    from PIL import Image

    # Open via a context manager so the file handle is released explicitly;
    # convert() returns an independent in-memory copy that outlives the block.
    with Image.open(resolved_out) as exported:
        img = exported.convert("RGB")
    w, h = img.size
    # Right half only, trimmed by a 5px margin on every side.
    crop = img.crop((w // 2 + 5, 5, w - 5, h - 5))
    arr = np.asarray(crop)
    assert (arr < 250).any()