from dataclasses import asdict
import json

from PIL import Image
import fitz

from docgenie.generation.constants import IMAGE_RENDER_EXT, PDF_DPI
from docgenie.generation.models import OCRBox, PipelineParameters, SynDatasetDefinition
from docgenie.generation.models._bbox import LayoutBox
from docgenie.generation.models._file import SyntheticDatasetFileStructure
from docgenie.generation.models._log import SynDocumentLog
from docgenie.generation.utils.bboxes import (
    read_syn_dataset_bboxes,
    save_bboxes,
)
from docgenie.generation.utils.documentsize import get_document_size_for_bbox_unnormalization
from docgenie.generation.utils.log import log_pipeline_level
from docgenie.generation.utils.status import get_progress_bar


def normalize_ocrbox(bbox: OCRBox, width_px, height_px) -> OCRBox:
    """Return a copy of ``bbox`` with its coordinates divided by the page size.

    ``x0``/``x2`` are divided by ``width_px`` and ``y0``/``y2`` by
    ``height_px``. No unit conversion is performed here: the box coordinates
    are used exactly as stored.
    NOTE(review): this presumes the box coordinates are already expressed in
    the same unit as ``width_px``/``height_px`` -- confirm against the caller.

    Args:
        bbox: Box whose coordinates should be normalized.
        width_px: Divisor applied to the horizontal coordinates.
        height_px: Divisor applied to the vertical coordinates.

    Returns:
        A new ``OCRBox`` carrying the scaled coordinates and the unchanged
        ``text``, ``block_no``, ``line_no`` and ``word_no`` of ``bbox``.
    """
    return OCRBox(
        x0=bbox.x0 / width_px,
        y0=bbox.y0 / height_px,
        x2=bbox.x2 / width_px,
        y2=bbox.y2 / height_px,
        text=bbox.text,
        block_no=bbox.block_no,
        line_no=bbox.line_no,
        word_no=bbox.word_no,
    )


def normalize_and_save_word_and_segment_bboxes(
    dsdef: SynDatasetDefinition, docid: str
) -> None:
    """Normalize and store the word- and segment-level boxes of one document.

    For each level the final boxes are read, every box is passed through
    ``normalize_ocrbox`` and the result is saved to the matching
    "final normalized" bbox path of the dataset file structure.

    Args:
        dsdef: Dataset definition providing the file structure.
        docid: Identifier of the document to process.
    """
    dsfiles = dsdef.get_file_structure()
    width_px, height_px = get_document_size_for_bbox_unnormalization(
        docid=docid, dsfiles=dsfiles
    )

    # Both levels go through the identical read -> normalize -> save steps;
    # "word" is processed before "segment".
    for level in ("word", "segment"):
        source_path = dsfiles.get_final_bbox_path(level=level, doc_id=docid)
        normalized = [
            normalize_ocrbox(bbox=box, width_px=width_px, height_px=height_px)
            for box in read_syn_dataset_bboxes(source_path)
        ]
        target_path = dsfiles.get_final_normalized_bbox_path(
            level=level, doc_id=docid
        )
        save_bboxes(bboxes=normalized, bbox_path=target_path)


def normalize_layout_bboxes(dsdef: SynDatasetDefinition, docid: str) -> None:
    """Normalize the raw layout annotations of one document and write the GT.

    The size of the first page of the final PDF is read, the raw annotation
    JSON is loaded into ``LayoutBox`` objects, each one is normalized with
    ``LayoutBox.normalize_to_pdf`` and the result is written as JSON into the
    ground-truth directory.

    Args:
        dsdef: Dataset definition providing the file structure.
        docid: Identifier of the document to process.
    """
    dsfiles = dsdef.get_file_structure()

    pdf_path = dsfiles.final_pdf_directory / f"{docid}.pdf"
    doc = fitz.open(pdf_path)
    try:
        # Only the first page's size is needed; grab the plain numbers so the
        # document can be released straight away.
        first_page_rect = doc[0].rect
        width_pt, height_pt = first_page_rect.width, first_page_rect.height
    finally:
        # Close the handle even if the page lookup fails, so a bad PDF does
        # not leak an open document.
        doc.close()

    raw_annotations_path = dsfiles.raw_annotations_directory / f"{docid}.json"
    raw_entries = json.loads(raw_annotations_path.read_text(encoding="utf-8"))
    layout_bboxes: list[LayoutBox] = [LayoutBox(**entry) for entry in raw_entries]

    normalized_bboxes = [
        LayoutBox.normalize_to_pdf(
            box, width_pt=width_pt, height_pt=height_pt, dpi=PDF_DPI
        )
        for box in layout_bboxes
    ]

    gt_path = dsfiles.gt_directory / f"{docid}.json"
    gt_path.write_text(
        json.dumps([asdict(box) for box in normalized_bboxes], indent=4),
        encoding="utf-8",
    )


def _normalize_documents_with_progress(description, docids, dsdef, normalize_one):
    """Call ``normalize_one(dsdef=..., docid=...)`` per id under a progress bar."""
    with get_progress_bar() as progress:
        task = progress.add_task(description, total=len(docids))
        for docid in docids:
            normalize_one(dsdef=dsdef, docid=docid)
            progress.update(task, advance=1)


def pipeline_normalize_bboxes(params: PipelineParameters) -> None:
    """Run the bbox normalization stage for all eligible documents.

    Two passes are made over the dataset's document logs:

    1. Documents with exactly one PDF page and ``ocr_found`` set get their
       word and segment boxes normalized.
    2. Documents with exactly one PDF page and at least one layout element
       get their layout (DLA ground-truth) boxes normalized.

    Args:
        params: Pipeline parameters; only ``params.dsdef`` is used here.
    """
    log_pipeline_level()
    dsdef = params.dsdef

    # Get documents valid for bbox normalization
    ocr_docids = [
        doclog.document_id
        for doclog in dsdef.get_document_logs()
        if doclog.pdf_num_pages == 1 and doclog.ocr_found
    ]
    print(f"Found {len(ocr_docids)} documents valid for BBox normalization.")
    _normalize_documents_with_progress(
        "[white]Normalizing BBoxes...",
        ocr_docids,
        dsdef,
        normalize_and_save_word_and_segment_bboxes,
    )

    # We also normalize the DLA GT here as they are layout BBoxes
    # Get documents valid for layout bbox normalization
    layout_docids = [
        doclog.document_id
        for doclog in dsdef.get_document_logs()
        if doclog.pdf_num_pages == 1 and doclog.layout_elements_num_elements > 0
    ]
    print(
        f"Found {len(layout_docids)} documents valid for Layout BBox normalization."
    )
    _normalize_documents_with_progress(
        "[white]Normalizing Layout BBoxes...",
        layout_docids,
        dsdef,
        normalize_layout_bboxes,
    )