# Source listing metadata: file size 3,516 bytes; id 5c6e6ce
from __future__ import annotations
import os
os.environ.setdefault("PYTHONUTF8", "1")
import numpy as np
from tools.redaction_review import export_review_page_ocr_visualisation_for_gradio
def test_export_review_page_ocr_visualisation_writes_file(tmp_path):
    """Check that exporting an OCR visualisation writes a file under tmp_path.

    The page image is a (120, 160, 3) uint8 array filled with 255, and the
    OCR payload holds one line with two words whose bounding boxes are given
    as integer coordinates.
    """
    blank_image = np.full((120, 160, 3), 255, dtype=np.uint8)
    page = {"image": blank_image, "boxes": []}

    hello_word = {
        "text": "Hello",
        "bounding_box": (10, 10, 60, 30),
        "conf": 95,
        "model": "Textract",
    }
    world_word = {
        "text": "world",
        "bounding_box": (70, 10, 120, 30),
        "conf": 85,
        "model": "Textract",
    }
    line_entry = {
        "line": 1,
        "text": "Hello world",
        "words": [hello_word, world_word],
    }
    ocr_with_words = [{"page": 1, "results": {"line_1": line_entry}}]

    output_dir = str(tmp_path)
    out = export_review_page_ocr_visualisation_for_gradio(
        page, 1, ocr_with_words, None, "doc.pdf", output_dir
    )
    assert out is not None

    # Resolve both paths so the containment check is not fooled by symlinks
    # or relative components.
    written_path = os.path.realpath(out)
    root = os.path.realpath(output_dir)
    assert os.path.commonpath([root, written_path]) == root
    assert os.path.isfile(written_path)

    # Normalise Windows separators before looking for the folder name.
    assert "review_ocr_visualisations" in out.replace("\\", "/")
def test_export_review_page_ocr_visualisation_draws_text_for_normalized_boxes(tmp_path):
    """Check that words with [0, 1]-normalized boxes still produce visible ink.

    Regression: some OCR pipelines provide bbox coords normalized to [0,1].
    The visualisation should scale these into pixel space and render text.
    The test exports the visualisation, verifies the file lives under
    tmp_path, then looks for any non-near-white pixel in the right-hand half
    of the written image.
    """
    page = {
        "image": np.full((120, 160, 3), 255, dtype=np.uint8),
        "boxes": [],
    }
    ocr_with_words = [
        {
            "page": 1,
            "results": {
                "line_1": {
                    "line": 1,
                    "text": "Hello world",
                    "words": [
                        {
                            "text": "Hello",
                            "bounding_box": (0.10, 0.10, 0.40, 0.25),
                            "conf": 95,
                            "model": "Textract",
                        },
                        {
                            "text": "world",
                            "bounding_box": (0.45, 0.10, 0.80, 0.25),
                            "conf": 85,
                            "model": "Textract",
                        },
                    ],
                }
            },
        }
    ]
    out = export_review_page_ocr_visualisation_for_gradio(
        page,
        1,
        ocr_with_words,
        None,
        "doc.pdf",
        str(tmp_path),
    )
    assert out is not None

    # Resolve both paths so the containment check is not fooled by symlinks
    # or relative components.
    resolved_out = os.path.realpath(out)
    safe_root = os.path.realpath(str(tmp_path))
    assert os.path.commonpath([safe_root, resolved_out]) == safe_root
    assert os.path.isfile(resolved_out)

    # Ensure there is non-white ink on the right-hand half (the text page).
    from PIL import Image

    # convert() returns an independent in-memory copy, so the file handle can
    # be released deterministically as soon as the pixels are decoded.
    with Image.open(resolved_out) as opened:
        img = opened.convert("RGB")
    w, h = img.size

    # Right half, inset 5 px from the midline and from every outer edge
    # (per the original note, to skip a patch where only a legend might
    # appear).
    crop = img.crop((w // 2 + 5, 5, w - 5, h - 5))
    arr = np.asarray(crop)
    assert (arr < 250).any(), "expected non-white pixels in the right half"
|