Spaces:

seanpedrickcase
/

document_redaction

Running

File size: 3,516 Bytes

5c6e6ce

from __future__ import annotations

import os

# Default PYTHONUTF8 to "1" without overriding a value already present in the
# environment.
# NOTE(review): CPython reads PYTHONUTF8 at interpreter start-up, so setting it
# here presumably targets child processes or code that inspects the variable —
# confirm this is the intent.
os.environ.setdefault("PYTHONUTF8", "1")

import numpy as np

from tools.redaction_review import export_review_page_ocr_visualisation_for_gradio


def test_export_review_page_ocr_visualisation_writes_file(tmp_path):
    """Check that exporting a page's OCR visualisation writes a file under the output folder.

    The export is called for page 1 of "doc.pdf" with two OCR words whose
    bounding boxes are integers (presumably pixel coordinates). The returned
    path must be non-None, resolve to an existing file inside ``tmp_path``, and
    contain the ``review_ocr_visualisations`` path component.
    """
    # Page fixture: an all-255 uint8 array of shape (120, 160, 3) and no boxes.
    blank_image = np.full((120, 160, 3), 255, dtype=np.uint8)
    review_page = {"image": blank_image, "boxes": []}

    # One OCR line made of two words.
    hello_word = {
        "text": "Hello",
        "bounding_box": (10, 10, 60, 30),
        "conf": 95,
        "model": "Textract",
    }
    world_word = {
        "text": "world",
        "bounding_box": (70, 10, 120, 30),
        "conf": 85,
        "model": "Textract",
    }
    first_line = {
        "line": 1,
        "text": "Hello world",
        "words": [hello_word, world_word],
    }
    ocr_pages = [{"page": 1, "results": {"line_1": first_line}}]

    output_folder = str(tmp_path)
    out = export_review_page_ocr_visualisation_for_gradio(
        review_page,
        1,
        ocr_pages,
        None,
        "doc.pdf",
        output_folder,
    )
    assert out is not None

    # Resolve both paths so the containment check is not fooled by symlinks
    # or relative segments.
    written_path = os.path.realpath(out)
    allowed_root = os.path.realpath(output_folder)
    assert os.path.commonpath([allowed_root, written_path]) == allowed_root
    assert os.path.isfile(written_path)

    # Normalise Windows separators before looking for the sub-folder name.
    normalised_out = out.replace("\\", "/")
    assert "review_ocr_visualisations" in normalised_out


def test_export_review_page_ocr_visualisation_draws_text_for_normalized_boxes(tmp_path):
    """Check that words with [0, 1]-normalised bounding boxes are still rendered.

    Regression: some OCR pipelines provide bbox coords normalized to [0,1].
    The visualisation should scale these into pixel space and render text, so
    the exported image must contain non-white pixels in its right-hand half.
    """
    # Page fixture: an all-255 uint8 array of shape (120, 160, 3) and no boxes.
    page = {
        "image": np.full((120, 160, 3), 255, dtype=np.uint8),
        "boxes": [],
    }
    ocr_with_words = [
        {
            "page": 1,
            "results": {
                "line_1": {
                    "line": 1,
                    "text": "Hello world",
                    "words": [
                        {
                            "text": "Hello",
                            "bounding_box": (0.10, 0.10, 0.40, 0.25),
                            "conf": 95,
                            "model": "Textract",
                        },
                        {
                            "text": "world",
                            "bounding_box": (0.45, 0.10, 0.80, 0.25),
                            "conf": 85,
                            "model": "Textract",
                        },
                    ],
                }
            },
        }
    ]

    out = export_review_page_ocr_visualisation_for_gradio(
        page,
        1,
        ocr_with_words,
        None,
        "doc.pdf",
        str(tmp_path),
    )
    assert out is not None

    # Resolve both paths so the containment check is not fooled by symlinks
    # or relative segments.
    resolved_out = os.path.realpath(out)
    safe_root = os.path.realpath(str(tmp_path))
    assert os.path.commonpath([safe_root, resolved_out]) == safe_root
    assert os.path.isfile(resolved_out)

    # Ensure there is non-white ink on the right-hand half (the text page).
    from PIL import Image

    # Open through a context manager so the file handle is released
    # deterministically; convert() returns an independent in-memory copy that
    # stays usable after the source image is closed.
    with Image.open(resolved_out) as exported:
        img = exported.convert("RGB")

    w, h = img.size
    # Right half of the image, inset by 5px on every side so that pixels on the
    # outer edges and on the centre seam are not counted as ink.
    crop = img.crop((w // 2 + 5, 5, w - 5, h - 5))
    arr = np.asarray(crop)
    # Any channel value below 250 counts as ink rather than white background.
    assert (arr < 250).any(), "no non-white pixels found in the right-hand half"