| import pathlib
|
| from typing import Literal
|
|
|
| from docgenie import ENV
|
|
|
|
|
class SyntheticDatasetFileStructure:
    """On-disk layout of a single synthetic dataset.

    Every location is exposed as a read-only property derived from
    ``base_path`` (``ENV.SYN_DATASETS_DIR / ds_name``). Constructing an
    instance creates every public ``*_directory`` location. The per-level
    sub-folders returned by the ``get_*_bbox_path`` helpers are *not*
    created here.
    """

    def __init__(self, ds_name: str):
        """Store the dataset name and create its directory tree.

        Args:
            ds_name: Folder name of the dataset below ``ENV.SYN_DATASETS_DIR``.
        """
        self.ds_name = ds_name

        # Creation order is kept from the original statement sequence.
        required_directories = (
            self.prompt_batches_directory,
            self.message_results_directory,
            self.preprocessed_seed_images_directory,
            self.message_processing_logs_directory,
            self.raw_html_directory,
            self.render_html_directory,
            self.render_html_second_pass_directory,
            self.geometries_directory,
            self.raw_annotations_directory,
            self.pdf_initial_directory,
            self.pdf_with_handwriting_directory,
            self.pdf_without_handwriting_placeholder_directory,
            self.final_pdf_directory,
            self.bboxes_pdf_directory,
            self.bboxes_final_directory,
            self.bboxes_final_normalized_directory,
            self.ocr_results_directory,
            self.img_directory,
            self.gt_directory,
            self.document_logs_directory,
            self.handwritten_bboxes_directory,
            self.visual_element_definitions_directory,
            self.visual_elements_directory,
            self.layout_element_definitions_directory,
            self.handwritten_text_images_directory,
            self.debug_pdf_visual_elements_directory,
            self.debug_pdf_handwriting_directory,
            self.debug_pdf_layout_directory,
            self.debug_pdf_geometries_directory,
            self.debug_pdf_bboxes_final_directory,
            self.debug_pdf_bboxes_directory,
            self.debug_pdf_bboxes_and_geos_directory,
            self.debug_ocr_bboxes_and_geos_directory,
            self.debug_html_raw_directory,
        )
        for directory in required_directories:
            # Missing parents are created; an existing directory is not an error.
            directory.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Dataset root and top-level log files
    # ------------------------------------------------------------------

    @property
    def base_path(self) -> pathlib.Path:
        """Root of this dataset: ``ENV.SYN_DATASETS_DIR / ds_name``."""
        return ENV.SYN_DATASETS_DIR / self.ds_name

    @property
    def ds_log_path(self) -> pathlib.Path:
        """Path of ``dataset_log.json`` directly below the dataset root."""
        return self.base_path / "dataset_log.json"

    @property
    def ds_csv_log_path(self) -> pathlib.Path:
        """Path of ``dataset_log.csv`` directly below the dataset root."""
        return self.base_path / "dataset_log.csv"

    # ------------------------------------------------------------------
    # logs/
    # ------------------------------------------------------------------

    @property
    def _logs_directory(self) -> pathlib.Path:
        """Shared ``logs`` parent folder below the dataset root."""
        return self.base_path / "logs"

    @property
    def prompt_batches_directory(self) -> pathlib.Path:
        """``logs/prompt_batches`` below the dataset root."""
        return self._logs_directory / "prompt_batches"

    @property
    def message_results_directory(self) -> pathlib.Path:
        """``logs/prompt_message_results`` below the dataset root."""
        return self._logs_directory / "prompt_message_results"

    @property
    def preprocessed_seed_images_directory(self) -> pathlib.Path:
        """``preprocessed_seed_images`` below the dataset root."""
        return self.base_path / "preprocessed_seed_images"

    @property
    def message_processing_logs_directory(self) -> pathlib.Path:
        """``logs/message_processing_logs`` below the dataset root."""
        return self._logs_directory / "message_processing_logs"

    # ------------------------------------------------------------------
    # html/
    # ------------------------------------------------------------------

    @property
    def _html_directory(self) -> pathlib.Path:
        """Shared ``html`` parent folder below the dataset root."""
        return self.base_path / "html"

    @property
    def raw_html_directory(self) -> pathlib.Path:
        """``html/raw_html`` below the dataset root."""
        return self._html_directory / "raw_html"

    @property
    def render_html_directory(self) -> pathlib.Path:
        """``html/render_html_pass1`` below the dataset root."""
        return self._html_directory / "render_html_pass1"

    @property
    def render_html_second_pass_directory(self) -> pathlib.Path:
        """``html/render_html_pass2`` below the dataset root."""
        return self._html_directory / "render_html_pass2"

    @property
    def geometries_directory(self) -> pathlib.Path:
        """``geometries`` below the dataset root."""
        return self.base_path / "geometries"

    # ------------------------------------------------------------------
    # pdf/
    # ------------------------------------------------------------------

    @property
    def _pdf_directory(self) -> pathlib.Path:
        """Shared ``pdf`` parent folder below the dataset root."""
        return self.base_path / "pdf"

    @property
    def pdf_initial_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text visible."""
        return self._pdf_directory / "pdf_initial"

    @property
    def pdf_without_handwriting_placeholder_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text and visual element placeholders invisible."""
        return self._pdf_directory / "pdf_without_handwriting_placeholder"

    @property
    def pdf_with_handwriting_directory(self) -> pathlib.Path:
        """Contains PDFs where handwriting and visual elements are invisible.

        Two render passes are needed because transparent text is not
        included in the PDF.

        NOTE(review): the property name says "with handwriting" while this
        description says the handwriting is invisible -- confirm which is
        intended.
        """
        return self._pdf_directory / "pdf_with_handwriting"

    @property
    def final_pdf_directory(self) -> pathlib.Path:
        """Contains final PDFs with handwriting and visual elements."""
        return self._pdf_directory / "pdf_final"

    # ------------------------------------------------------------------
    # bbox/
    # ------------------------------------------------------------------

    @property
    def _bbox_directory(self) -> pathlib.Path:
        """Shared ``bbox`` parent folder below the dataset root."""
        return self.base_path / "bbox"

    @property
    def bboxes_pdf_directory(self) -> pathlib.Path:
        """Contains the bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_pdf"

    @property
    def bboxes_final_directory(self) -> pathlib.Path:
        """Contains the final bounding boxes.

        For documents which contain handwriting or visual elements these are
        the bounding boxes retrieved via OCR; otherwise they are the bounding
        boxes which were extracted from the PDF.
        """
        return self._bbox_directory / "bbox_final"

    @property
    def bboxes_final_normalized_directory(self) -> pathlib.Path:
        """Contains the final bboxes but normalized to image size."""
        return self._bbox_directory / "bbox_final_normalized"

    @property
    def ocr_results_directory(self) -> pathlib.Path:
        """Contains OCR results for documents which contain handwriting or visual elements."""
        return self.base_path / "ocr_results"

    @property
    def img_directory(self) -> pathlib.Path:
        """``img`` below the dataset root."""
        return self.base_path / "img"

    # ------------------------------------------------------------------
    # annotations/
    # ------------------------------------------------------------------

    @property
    def _annotations_directory(self) -> pathlib.Path:
        """Shared ``annotations`` parent folder below the dataset root."""
        return self.base_path / "annotations"

    @property
    def gt_directory(self) -> pathlib.Path:
        """``annotations/gt`` below the dataset root."""
        return self._annotations_directory / "gt"

    @property
    def raw_annotations_directory(self) -> pathlib.Path:
        """``annotations/raw_annotations`` below the dataset root."""
        return self._annotations_directory / "raw_annotations"

    @property
    def document_logs_directory(self) -> pathlib.Path:
        """``logs/document_logs`` below the dataset root."""
        return self._logs_directory / "document_logs"

    # ------------------------------------------------------------------
    # handwriting/
    # ------------------------------------------------------------------

    @property
    def _handwriting_directory(self) -> pathlib.Path:
        """Shared ``handwriting`` parent folder below the dataset root."""
        return self.base_path / "handwriting"

    @property
    def handwritten_bboxes_directory(self) -> pathlib.Path:
        """``handwriting/handwriting_bbox`` below the dataset root."""
        return self._handwriting_directory / "handwriting_bbox"

    @property
    def handwritten_text_images_directory(self) -> pathlib.Path:
        """``handwriting/handwriting_raw_tokens`` below the dataset root."""
        return self._handwriting_directory / "handwriting_raw_tokens"

    # ------------------------------------------------------------------
    # visual_elements/ and layout definitions
    # ------------------------------------------------------------------

    @property
    def _visual_elements_directory(self) -> pathlib.Path:
        """Shared ``visual_elements`` parent folder below the dataset root."""
        return self.base_path / "visual_elements"

    @property
    def visual_element_definitions_directory(self) -> pathlib.Path:
        """``visual_elements/visual_element_definitions`` below the dataset root."""
        return self._visual_elements_directory / "visual_element_definitions"

    @property
    def visual_elements_directory(self) -> pathlib.Path:
        """``visual_elements/visual_elements_images`` below the dataset root."""
        return self._visual_elements_directory / "visual_elements_images"

    @property
    def layout_element_definitions_directory(self) -> pathlib.Path:
        """``layout_element_definitions`` below the dataset root."""
        return self.base_path / "layout_element_definitions"

    # ------------------------------------------------------------------
    # debug/
    # ------------------------------------------------------------------

    @property
    def _debug_directory(self) -> pathlib.Path:
        """Shared ``debug`` parent folder below the dataset root."""
        return self.base_path / "debug"

    @property
    def debug_pdf_visual_elements_directory(self) -> pathlib.Path:
        """``debug/visual_elements`` below the dataset root."""
        return self._debug_directory / "visual_elements"

    @property
    def debug_pdf_handwriting_directory(self) -> pathlib.Path:
        """``debug/handwriting`` below the dataset root."""
        return self._debug_directory / "handwriting"

    @property
    def debug_pdf_layout_directory(self) -> pathlib.Path:
        """``debug/layout`` below the dataset root."""
        return self._debug_directory / "layout"

    @property
    def debug_pdf_geometries_directory(self) -> pathlib.Path:
        """``debug/geometries`` below the dataset root."""
        return self._debug_directory / "geometries"

    @property
    def debug_pdf_bboxes_final_directory(self) -> pathlib.Path:
        """``debug/bboxes_final`` below the dataset root."""
        return self._debug_directory / "bboxes_final"

    @property
    def debug_pdf_bboxes_directory(self) -> pathlib.Path:
        """``debug/bboxes`` below the dataset root."""
        return self._debug_directory / "bboxes"

    @property
    def debug_pdf_bboxes_and_geos_directory(self) -> pathlib.Path:
        """``debug/bboxes_and_geos`` below the dataset root."""
        return self._debug_directory / "bboxes_and_geos"

    @property
    def debug_ocr_bboxes_and_geos_directory(self) -> pathlib.Path:
        """``debug/ocr_bboxes_and_geos`` below the dataset root."""
        return self._debug_directory / "ocr_bboxes_and_geos"

    @property
    def debug_html_raw_directory(self) -> pathlib.Path:
        """``debug/html_raw`` below the dataset root."""
        return self._debug_directory / "html_raw"

    # ------------------------------------------------------------------
    # Per-document bounding-box files
    # ------------------------------------------------------------------

    def get_pdf_bbox_path(
        self, level: Literal["word", "char"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_pdf_directory>/<level>/<doc_id>.txt``.

        Only the path is built; neither the ``level`` folder nor the file
        is created by this call.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_pdf_directory / level / file_name

    def get_final_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_final_directory>/<level>/<doc_id>.txt``.

        Only the path is built; neither the ``level`` folder nor the file
        is created by this call.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_final_directory / level / file_name

    def get_final_normalized_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_final_normalized_directory>/<level>/<doc_id>.txt``.

        Only the path is built; neither the ``level`` folder nor the file
        is created by this call.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_final_normalized_directory / level / file_name
|
|
|