| from dataclasses import asdict
|
| import json
|
| from PIL import Image
|
| import fitz
|
| from docgenie.generation.constants import IMAGE_RENDER_EXT, PDF_DPI
|
| from docgenie.generation.models import OCRBox, PipelineParameters, SynDatasetDefinition
|
| from docgenie.generation.models._bbox import LayoutBox
|
| from docgenie.generation.models._file import SyntheticDatasetFileStructure
|
| from docgenie.generation.models._log import SynDocumentLog
|
| from docgenie.generation.utils.bboxes import (
|
| read_syn_dataset_bboxes,
|
| save_bboxes,
|
| )
|
| from docgenie.generation.utils.documentsize import get_document_size_for_bbox_unnormalization
|
| from docgenie.generation.utils.log import log_pipeline_level
|
| from docgenie.generation.utils.status import get_progress_bar
|
|
|
|
|
def normalize_ocrbox(bbox: OCRBox, width_px, height_px):
    """
    Rescale an OCR box so its coordinates are relative to the given size.

    The horizontal coordinates (``x0``, ``x2``) are divided by ``width_px``
    and the vertical coordinates (``y0``, ``y2``) by ``height_px``. The text
    and the block/line/word indices are copied over unchanged.

    Returns a new ``OCRBox``; the input box is only read, never modified.
    """
    return OCRBox(
        x0=bbox.x0 / width_px,
        y0=bbox.y0 / height_px,
        x2=bbox.x2 / width_px,
        y2=bbox.y2 / height_px,
        text=bbox.text,
        block_no=bbox.block_no,
        line_no=bbox.line_no,
        word_no=bbox.word_no,
    )
|
|
|
|
|
def normalize_and_save_word_and_segment_bboxes(dsdef: SynDatasetDefinition, docid: str):
    """
    Write normalized word-level and segment-level boxes for one document.

    The document size is looked up once via
    ``get_document_size_for_bbox_unnormalization``. Then, for the "word"
    level followed by the "segment" level, the final boxes are read, each box
    is passed through ``normalize_ocrbox`` with that size, and the result is
    saved to the matching normalized-bbox path.
    """
    files = dsdef.get_file_structure()
    page_w, page_h = get_document_size_for_bbox_unnormalization(docid=docid, dsfiles=files)

    # Both levels go through the identical read -> normalize -> save sequence.
    for level in ("word", "segment"):
        source_path = files.get_final_bbox_path(level=level, doc_id=docid)
        source_boxes = read_syn_dataset_bboxes(source_path)
        normalized_boxes = [
            normalize_ocrbox(bbox=box, width_px=page_w, height_px=page_h)
            for box in source_boxes
        ]
        target_path = files.get_final_normalized_bbox_path(level=level, doc_id=docid)
        save_bboxes(bboxes=normalized_boxes, bbox_path=target_path)
|
|
|
|
|
def normalize_layout_bboxes(dsdef: SynDatasetDefinition, docid: str) -> None:
    """
    Normalize the raw layout annotations of one document and store them as GT.

    Reads the size of the first page of ``<docid>.pdf`` from the final PDF
    directory, loads ``<docid>.json`` from the raw annotations directory as a
    list of ``LayoutBox`` records, passes each one through
    ``LayoutBox.normalize_to_pdf`` together with that page size and
    ``PDF_DPI``, and writes the results as indented JSON to ``<docid>.json``
    in the ground-truth directory.

    Args:
        dsdef: Dataset definition that provides the file structure.
        docid: Identifier of the document (file stem) to process.
    """
    dsfiles = dsdef.get_file_structure()

    pdf_path = dsfiles.final_pdf_directory / f"{docid}.pdf"
    # Only the first page's dimensions are needed. Scoping the document to a
    # ``with`` block guarantees it is closed even if a later step (reading,
    # parsing, normalizing, or writing) raises.
    with fitz.open(pdf_path) as doc:
        page_rect = doc[0].rect
        width_pt, height_pt = page_rect.width, page_rect.height

    raw_annotations_path = dsfiles.raw_annotations_directory / f"{docid}.json"
    data = json.loads(raw_annotations_path.read_text(encoding="utf-8"))
    layout_bboxes: list[LayoutBox] = [LayoutBox(**d) for d in data]

    layout_bboxes_normalized = [
        LayoutBox.normalize_to_pdf(
            b, width_pt=width_pt, height_pt=height_pt, dpi=PDF_DPI
        )
        for b in layout_bboxes
    ]

    boxes_dicts = [asdict(b) for b in layout_bboxes_normalized]
    gt_path = dsfiles.gt_directory / f"{docid}.json"
    gt_path.write_text(json.dumps(boxes_dicts, indent=4), encoding="utf-8")
|
|
|
|
|
def pipeline_normalize_bboxes(params: PipelineParameters):
    """
    Run both bounding-box normalization passes over the dataset's documents.

    Pass 1 normalizes word/segment boxes for every logged document with
    exactly one PDF page whose ``ocr_found`` flag is truthy. Pass 2 normalizes
    layout boxes for every logged document with exactly one PDF page and
    ``layout_elements_num_elements > 0``. Each pass prints how many documents
    it selected and advances a progress bar once per processed document.
    """
    log_pipeline_level()

    dataset = params.dsdef

    # Pass 1: word- and segment-level OCR boxes.
    ocr_doc_ids = [
        entry.document_id
        for entry in dataset.get_document_logs()
        if entry.pdf_num_pages == 1 and entry.ocr_found
    ]
    print(f"Found {len(ocr_doc_ids)} documents valid for BBox normalization.")

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Normalizing BBoxes...", total=len(ocr_doc_ids)
        )
        for doc_id in ocr_doc_ids:
            normalize_and_save_word_and_segment_bboxes(dsdef=dataset, docid=doc_id)
            progress.update(task, advance=1)

    # Pass 2: layout boxes. The document logs are queried again rather than
    # reused from pass 1.
    layout_doc_ids = [
        entry.document_id
        for entry in dataset.get_document_logs()
        if entry.pdf_num_pages == 1 and entry.layout_elements_num_elements > 0
    ]
    print(
        f"Found {len(layout_doc_ids)} documents valid for Layout BBox normalization."
    )

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Normalizing Layout BBoxes...", total=len(layout_doc_ids)
        )
        for doc_id in layout_doc_ids:
            normalize_layout_bboxes(dsdef=dataset, docid=doc_id)
            progress.update(task, advance=1)
|
|
|