Spaces:

Text-to-Document-Generation
/

Docgenie-API

Paused

Docgenie-API / docgenie /generation /pipeline_16_normalize_bboxes.py

Ahadhassan-2003

deploy: update HF Space

dc4e6da about 1 month ago

5.38 kB

	from dataclasses import asdict
	import json
	from PIL import Image
	import fitz
	from docgenie.generation.constants import IMAGE_RENDER_EXT, PDF_DPI
	from docgenie.generation.models import OCRBox, PipelineParameters, SynDatasetDefinition
	from docgenie.generation.models._bbox import LayoutBox
	from docgenie.generation.models._file import SyntheticDatasetFileStructure
	from docgenie.generation.models._log import SynDocumentLog
	from docgenie.generation.utils.bboxes import (
	read_syn_dataset_bboxes,
	save_bboxes,
	)
	from docgenie.generation.utils.documentsize import get_document_size_for_bbox_unnormalization
	from docgenie.generation.utils.log import log_pipeline_level
	from docgenie.generation.utils.status import get_progress_bar


	def normalize_ocrbox(bbox: OCRBox, width_px, height_px):
	    """
	    Return a copy of ``bbox`` whose corners are divided by the given size.

	    The x coordinates (``x0``, ``x2``) are divided by ``width_px`` and the
	    y coordinates (``y0``, ``y2``) by ``height_px``. No other unit
	    conversion happens here, so the input coordinates are presumably
	    already expressed in the same unit as the two size arguments.

	    The text and the block/line/word indices are carried over unchanged.
	    A zero ``width_px`` or ``height_px`` raises ``ZeroDivisionError``.
	    """
	    # Keyed by the OCRBox field each value feeds, in x0, y0, x2, y2 order.
	    scaled_corners = {
	        "x0": bbox.x0 / width_px,
	        "y0": bbox.y0 / height_px,
	        "x2": bbox.x2 / width_px,
	        "y2": bbox.y2 / height_px,
	    }

	    return OCRBox(
	        **scaled_corners,
	        text=bbox.text,
	        block_no=bbox.block_no,
	        line_no=bbox.line_no,
	        word_no=bbox.word_no,
	    )


	def normalize_and_save_word_and_segment_bboxes(dsdef: SynDatasetDefinition, docid: str):
	    """
	    Normalize and store the word-level and segment-level boxes of a document.

	    For each of the two levels ("word" first, then "segment") the final
	    boxes are read, every box is passed through ``normalize_ocrbox`` with
	    the document size returned by
	    ``get_document_size_for_bbox_unnormalization``, and the result is saved
	    to the corresponding normalized-bbox path.
	    """
	    dsfiles = dsdef.get_file_structure()
	    width_px, height_px = get_document_size_for_bbox_unnormalization(
	        docid=docid, dsfiles=dsfiles
	    )

	    # Both levels go through the exact same read -> normalize -> save steps.
	    for level in ("word", "segment"):
	        source_path = dsfiles.get_final_bbox_path(level=level, doc_id=docid)
	        normalized_boxes = [
	            normalize_ocrbox(bbox=box, width_px=width_px, height_px=height_px)
	            for box in read_syn_dataset_bboxes(source_path)
	        ]
	        target_path = dsfiles.get_final_normalized_bbox_path(
	            level=level, doc_id=docid
	        )
	        save_bboxes(bboxes=normalized_boxes, bbox_path=target_path)


	def normalize_layout_bboxes(dsdef: SynDatasetDefinition, docid: str):
	    """
	    Normalize a document's layout boxes and write them as ground truth.

	    The size of the first page of ``<docid>.pdf`` (in the final PDF
	    directory) is read, the raw annotations in ``<docid>.json`` are loaded
	    as ``LayoutBox`` objects, each one is passed through
	    ``LayoutBox.normalize_to_pdf`` with that page size and ``PDF_DPI``, and
	    the results are written as an indented JSON list to ``<docid>.json`` in
	    the ground-truth directory.

	    Errors from opening the PDF, reading or parsing the annotation file, or
	    constructing the boxes propagate to the caller.
	    """
	    dsfiles = dsdef.get_file_structure()

	    pdf_path = dsfiles.final_pdf_directory / f"{docid}.pdf"
	    # Only the first page's dimensions are needed. The context manager
	    # closes the PDF right after they are read, and also when a later step
	    # would have raised before the explicit close() was reached.
	    with fitz.open(pdf_path) as doc:
	        first_page_rect = doc[0].rect
	        width_pt, height_pt = first_page_rect.width, first_page_rect.height

	    raw_annotations_path = dsfiles.raw_annotations_directory / f"{docid}.json"
	    data = json.loads(raw_annotations_path.read_text(encoding="utf-8"))
	    layout_bboxes: list[LayoutBox] = [LayoutBox(**d) for d in data]

	    layout_bboxes_normalized = [
	        LayoutBox.normalize_to_pdf(
	            b, width_pt=width_pt, height_pt=height_pt, dpi=PDF_DPI
	        )
	        for b in layout_bboxes
	    ]

	    boxes_dicts = [asdict(b) for b in layout_bboxes_normalized]
	    gt_path = dsfiles.gt_directory / f"{docid}.json"
	    gt_path.write_text(json.dumps(boxes_dicts, indent=4), encoding="utf-8")


	def _run_normalization_stage(dsdef, doc_ids, description, normalize_fn):
	    """Call ``normalize_fn`` for every document id, advancing a progress bar."""
	    with get_progress_bar() as progress:
	        task = progress.add_task(description, total=len(doc_ids))

	        for docid in doc_ids:
	            normalize_fn(dsdef=dsdef, docid=docid)
	            progress.update(task, advance=1)


	def pipeline_normalize_bboxes(params: PipelineParameters):
	    """
	    Pipeline step: normalize OCR boxes, then layout boxes, of a dataset.

	    Stage 1 handles every logged document that has exactly one PDF page and
	    for which OCR was found, normalizing its word and segment boxes.
	    Stage 2 handles every logged document that has exactly one PDF page and
	    at least one layout element, normalizing its layout (DLA ground-truth)
	    boxes. The number of selected documents is printed before each stage.
	    """
	    log_pipeline_level()

	    dsdef = params.dsdef

	    # Stage 1: word/segment boxes for single-page documents with OCR output.
	    ocr_doc_ids = [
	        doclog.document_id
	        for doclog in dsdef.get_document_logs()
	        if doclog.pdf_num_pages == 1 and doclog.ocr_found
	    ]

	    print(f"Found {len(ocr_doc_ids)} documents valid for BBox normalization.")

	    _run_normalization_stage(
	        dsdef=dsdef,
	        doc_ids=ocr_doc_ids,
	        description="[white]Normalizing BBoxes...",
	        normalize_fn=normalize_and_save_word_and_segment_bboxes,
	    )

	    # Stage 2: the DLA ground truth is normalized here too, since it also
	    # consists of layout boxes. The logs are fetched again only after
	    # stage 1 has finished, as before.
	    layout_doc_ids = [
	        doclog.document_id
	        for doclog in dsdef.get_document_logs()
	        if doclog.pdf_num_pages == 1 and doclog.layout_elements_num_elements > 0
	    ]

	    print(
	        f"Found {len(layout_doc_ids)} documents valid for Layout BBox normalization."
	    )

	    _run_normalization_stage(
	        dsdef=dsdef,
	        doc_ids=layout_doc_ids,
	        description="[white]Normalizing Layout BBoxes...",
	        normalize_fn=normalize_layout_bboxes,
	    )