Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / test /test_review_ocr_visualisation_export.py

seanpedrickcase

Sync: Updated skills, docker compose. Minor fix to preview_boxes api route

d0ca26d 5 days ago

raw

history blame contribute delete

3.52 kB

	from __future__ import annotations

	import os

	os.environ.setdefault("PYTHONUTF8", "1")

	import numpy as np

	from tools.redaction_review import export_review_page_ocr_visualisation_for_gradio


	def test_export_review_page_ocr_visualisation_writes_file(tmp_path):
	    """Export an OCR visualisation for one page and check the output file.

	    The fixture is a 120x160 three-channel array filled with 255, no
	    existing boxes, and a single OCR line made of two words whose bounding
	    boxes are given as integer coordinates.

	    The test passes when the exporter returns a path that:
	      * is not ``None``,
	      * resolves to a location inside ``tmp_path``,
	      * points at an existing file, and
	      * contains ``review_ocr_visualisations`` once backslashes are
	        normalised to forward slashes.
	    """
	    blank_image = np.full((120, 160, 3), 255, dtype=np.uint8)
	    page = {"image": blank_image, "boxes": []}

	    # Build the OCR payload bottom-up: words -> line -> page entry.
	    hello_word = {
	        "text": "Hello",
	        "bounding_box": (10, 10, 60, 30),
	        "conf": 95,
	        "model": "Textract",
	    }
	    world_word = {
	        "text": "world",
	        "bounding_box": (70, 10, 120, 30),
	        "conf": 85,
	        "model": "Textract",
	    }
	    line_entry = {
	        "line": 1,
	        "text": "Hello world",
	        "words": [hello_word, world_word],
	    }
	    ocr_with_words = [{"page": 1, "results": {"line_1": line_entry}}]

	    output_root = str(tmp_path)
	    exported = export_review_page_ocr_visualisation_for_gradio(
	        page, 1, ocr_with_words, None, "doc.pdf", output_root
	    )
	    assert exported is not None

	    # Compare fully resolved paths so symlinked temp directories do not
	    # produce a false mismatch in the containment check.
	    real_exported = os.path.realpath(exported)
	    real_root = os.path.realpath(output_root)
	    assert os.path.commonpath([real_root, real_exported]) == real_root
	    assert os.path.isfile(real_exported)

	    # Normalise Windows separators before looking for the folder name.
	    posix_style = exported.replace("\\", "/")
	    assert "review_ocr_visualisations" in posix_style


	def test_export_review_page_ocr_visualisation_draws_text_for_normalized_boxes(tmp_path):
	    """Check that words with normalized bounding boxes still get drawn.

	    Regression: some OCR pipelines provide bbox coords normalized to [0,1].
	    The visualisation should scale these into pixel space and render text.

	    The fixture is a 120x160 three-channel array filled with 255 and one OCR
	    line of two words whose boxes are fractions of the page. After the
	    export, the test confirms the output file lives under ``tmp_path`` and
	    that the right-hand half of the saved image contains at least one
	    channel value below 250 (i.e. something other than near-white).
	    """
	    page = {
	        "image": np.full((120, 160, 3), 255, dtype=np.uint8),
	        "boxes": [],
	    }
	    ocr_with_words = [
	        {
	            "page": 1,
	            "results": {
	                "line_1": {
	                    "line": 1,
	                    "text": "Hello world",
	                    "words": [
	                        {
	                            "text": "Hello",
	                            "bounding_box": (0.10, 0.10, 0.40, 0.25),
	                            "conf": 95,
	                            "model": "Textract",
	                        },
	                        {
	                            "text": "world",
	                            "bounding_box": (0.45, 0.10, 0.80, 0.25),
	                            "conf": 85,
	                            "model": "Textract",
	                        },
	                    ],
	                }
	            },
	        }
	    ]

	    out = export_review_page_ocr_visualisation_for_gradio(
	        page,
	        1,
	        ocr_with_words,
	        None,
	        "doc.pdf",
	        str(tmp_path),
	    )
	    assert out is not None
	    resolved_out = os.path.realpath(out)
	    safe_root = os.path.realpath(str(tmp_path))
	    assert os.path.commonpath([safe_root, resolved_out]) == safe_root
	    assert os.path.isfile(resolved_out)

	    # Ensure there is non-white ink on the right-hand half (the text page).
	    from PIL import Image

	    # Close the source file deterministically: the converted copy is all we
	    # need, and an open handle can block tmp_path cleanup on some platforms.
	    with Image.open(resolved_out) as exported_image:
	        img = exported_image.convert("RGB")
	    w, h = img.size
	    # Right half only, inset by 5 px on every edge so that border pixels
	    # are not what satisfies the check.
	    crop = img.crop((w // 2 + 5, 5, w - 5, h - 5))
	    arr = np.asarray(crop)
	    assert (arr < 250).any(), (
	        "expected non-white pixels in the right half of the exported image"
	    )