# Ahadhassan-2003
# deploy: update HF Space
# dc4e6da
import pathlib
from typing import Literal
from docgenie import ENV
class SyntheticDatasetFileStructure:
    """Filesystem layout of one synthetic dataset.

    Every location is a read-only property derived from ``base_path``
    (``ENV.SYN_DATASETS_DIR / ds_name``). Nothing is cached: each access
    rebuilds the path. Constructing an instance creates all working
    directories on disk (see ``__init__``).
    """

    def __init__(self, ds_name: str):
        """Store the dataset name and create every working directory.

        Directories are created with ``parents=True, exist_ok=True``, so
        building the layout for an already existing dataset is harmless.

        Args:
            ds_name: Folder name of the dataset below
                ``ENV.SYN_DATASETS_DIR``.
        """
        self.ds_name = ds_name
        for directory in self._managed_directories():
            directory.mkdir(parents=True, exist_ok=True)

    def _managed_directories(self) -> tuple:
        """Return, in creation order, every directory ``__init__`` creates."""
        return (
            self.prompt_batches_directory,
            self.message_results_directory,
            self.preprocessed_seed_images_directory,
            self.message_processing_logs_directory,
            self.raw_html_directory,
            self.render_html_directory,
            self.render_html_second_pass_directory,
            self.geometries_directory,
            self.raw_annotations_directory,
            self.pdf_initial_directory,
            self.pdf_with_handwriting_directory,
            self.pdf_without_handwriting_placeholder_directory,
            self.final_pdf_directory,
            self.bboxes_pdf_directory,
            self.bboxes_final_directory,
            self.bboxes_final_normalized_directory,
            self.ocr_results_directory,
            self.img_directory,
            self.gt_directory,
            self.document_logs_directory,
            self.handwritten_bboxes_directory,
            self.visual_element_definitions_directory,
            self.visual_elements_directory,
            self.layout_element_definitions_directory,
            # Directories for handwritten text images
            self.handwritten_text_images_directory,
            self.debug_pdf_visual_elements_directory,
            self.debug_pdf_handwriting_directory,
            self.debug_pdf_layout_directory,
            self.debug_pdf_geometries_directory,
            self.debug_pdf_bboxes_final_directory,
            self.debug_pdf_bboxes_directory,
            self.debug_pdf_bboxes_and_geos_directory,
            self.debug_ocr_bboxes_and_geos_directory,
            self.debug_html_raw_directory,
        )

    # ------------------------------------------------------------------
    # Dataset root and top-level log files
    # ------------------------------------------------------------------

    @property
    def base_path(self) -> pathlib.Path:
        """Root folder of this dataset: ``ENV.SYN_DATASETS_DIR / ds_name``."""
        return ENV.SYN_DATASETS_DIR / self.ds_name

    @property
    def ds_log_path(self) -> pathlib.Path:
        """File ``dataset_log.json`` directly below the dataset root."""
        return self.base_path / "dataset_log.json"

    @property
    def ds_csv_log_path(self) -> pathlib.Path:
        """File ``dataset_log.csv`` directly below the dataset root."""
        return self.base_path / "dataset_log.csv"

    # ------------------------------------------------------------------
    # Prompting and seed data
    # ------------------------------------------------------------------

    # Keep on reset
    @property
    def prompt_batches_directory(self) -> pathlib.Path:
        """Directory ``logs/prompt_batches`` below the dataset root."""
        return self.base_path / "logs" / "prompt_batches"

    # Keep on reset
    @property
    def message_results_directory(self) -> pathlib.Path:
        """Directory ``logs/prompt_message_results`` below the dataset root."""
        return self.base_path / "logs" / "prompt_message_results"

    # Keep on reset
    @property
    def preprocessed_seed_images_directory(self) -> pathlib.Path:
        """Directory ``preprocessed_seed_images`` below the dataset root."""
        return self.base_path / "preprocessed_seed_images"

    @property
    def message_processing_logs_directory(self) -> pathlib.Path:
        """Directory ``logs/message_processing_logs`` below the dataset root."""
        return self.base_path / "logs" / "message_processing_logs"

    # ------------------------------------------------------------------
    # HTML
    # ------------------------------------------------------------------

    @property
    def _html_directory(self) -> pathlib.Path:
        """Parent folder ``html`` shared by all HTML sub-directories."""
        return self.base_path / "html"

    @property
    def raw_html_directory(self) -> pathlib.Path:
        """Directory ``html/raw_html`` below the dataset root."""
        return self._html_directory / "raw_html"

    @property
    def render_html_directory(self) -> pathlib.Path:
        """Directory ``html/render_html_pass1`` below the dataset root."""
        return self._html_directory / "render_html_pass1"

    @property
    def render_html_second_pass_directory(self) -> pathlib.Path:
        """Directory ``html/render_html_pass2`` below the dataset root."""
        return self._html_directory / "render_html_pass2"

    @property
    def geometries_directory(self) -> pathlib.Path:
        """Directory ``geometries`` below the dataset root."""
        return self.base_path / "geometries"

    # ------------------------------------------------------------------
    # PDF
    # ------------------------------------------------------------------

    @property
    def _pdf_directory(self) -> pathlib.Path:
        """Parent folder ``pdf`` shared by all PDF sub-directories."""
        return self.base_path / "pdf"

    @property
    def pdf_initial_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text visible."""
        return self._pdf_directory / "pdf_initial"

    @property
    def pdf_without_handwriting_placeholder_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text and visual element
        placeholders invisible."""
        return self._pdf_directory / "pdf_without_handwriting_placeholder"

    @property
    def pdf_with_handwriting_directory(self) -> pathlib.Path:
        """Contains PDFs where Handwriting and Visual Elements are invisible
        (need two render passes because transparent text is not included in
        PDF)."""
        # NOTE(review): "invisible" reads oddly next to the property name
        # ("with_handwriting") - confirm the wording with the pipeline author.
        return self._pdf_directory / "pdf_with_handwriting"

    @property
    def final_pdf_directory(self) -> pathlib.Path:
        """Contains final PDFs with handwriting and visual elements."""
        return self._pdf_directory / "pdf_final"

    # ------------------------------------------------------------------
    # Bounding boxes and OCR
    # ------------------------------------------------------------------

    @property
    def _bbox_directory(self) -> pathlib.Path:
        """Parent folder ``bbox`` shared by all bounding-box sub-directories."""
        return self.base_path / "bbox"

    @property
    def bboxes_pdf_directory(self) -> pathlib.Path:
        """Contains the bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_pdf"

    @property
    def bboxes_final_directory(self) -> pathlib.Path:
        """For documents which contain handwriting or visual elements, this
        contains bounding boxes retrieved via OCR. Otherwise contains the
        bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_final"

    @property
    def bboxes_final_normalized_directory(self) -> pathlib.Path:
        """Contains the final bboxes but normalized to image size."""
        return self._bbox_directory / "bbox_final_normalized"

    @property
    def ocr_results_directory(self) -> pathlib.Path:
        """Contains OCR results for documents which contain handwriting or
        visual elements."""
        return self.base_path / "ocr_results"

    @property
    def img_directory(self) -> pathlib.Path:
        """Directory ``img`` below the dataset root."""
        return self.base_path / "img"

    # ------------------------------------------------------------------
    # Annotations and per-document logs
    # ------------------------------------------------------------------

    @property
    def _annotations_directory(self) -> pathlib.Path:
        """Parent folder ``annotations`` shared by annotation sub-directories."""
        return self.base_path / "annotations"

    @property
    def gt_directory(self) -> pathlib.Path:
        """Directory ``annotations/gt`` below the dataset root."""
        return self._annotations_directory / "gt"

    @property
    def raw_annotations_directory(self) -> pathlib.Path:
        """Directory ``annotations/raw_annotations`` below the dataset root."""
        return self._annotations_directory / "raw_annotations"

    @property
    def document_logs_directory(self) -> pathlib.Path:
        """Directory ``logs/document_logs`` below the dataset root."""
        return self.base_path / "logs" / "document_logs"

    # ------------------------------------------------------------------
    # Handwriting
    # ------------------------------------------------------------------

    @property
    def _handwriting_directory(self) -> pathlib.Path:
        """Parent folder ``handwriting`` shared by handwriting sub-directories."""
        return self.base_path / "handwriting"

    @property
    def handwritten_bboxes_directory(self) -> pathlib.Path:
        """Directory ``handwriting/handwriting_bbox`` below the dataset root."""
        return self._handwriting_directory / "handwriting_bbox"

    # Directories for handwritten text images
    @property
    def handwritten_text_images_directory(self) -> pathlib.Path:
        """Directory ``handwriting/handwriting_raw_tokens`` below the root."""
        return self._handwriting_directory / "handwriting_raw_tokens"

    # ------------------------------------------------------------------
    # Visual and layout elements
    # ------------------------------------------------------------------

    @property
    def _visual_elements_directory(self) -> pathlib.Path:
        """Parent folder ``visual_elements`` shared by its sub-directories."""
        return self.base_path / "visual_elements"

    @property
    def visual_element_definitions_directory(self) -> pathlib.Path:
        """Directory ``visual_elements/visual_element_definitions``."""
        return self._visual_elements_directory / "visual_element_definitions"

    @property
    def visual_elements_directory(self) -> pathlib.Path:
        """Directory ``visual_elements/visual_elements_images``."""
        return self._visual_elements_directory / "visual_elements_images"

    @property
    def layout_element_definitions_directory(self) -> pathlib.Path:
        """Directory ``layout_element_definitions`` below the dataset root."""
        return self.base_path / "layout_element_definitions"

    # ------------------------------------------------------------------
    # Debug output
    # ------------------------------------------------------------------

    @property
    def _debug_directory(self) -> pathlib.Path:
        """Parent folder ``debug`` shared by all debug sub-directories."""
        return self.base_path / "debug"

    @property
    def debug_pdf_visual_elements_directory(self) -> pathlib.Path:
        """Directory ``debug/visual_elements`` below the dataset root."""
        return self._debug_directory / "visual_elements"

    @property
    def debug_pdf_handwriting_directory(self) -> pathlib.Path:
        """Directory ``debug/handwriting`` below the dataset root."""
        return self._debug_directory / "handwriting"

    @property
    def debug_pdf_layout_directory(self) -> pathlib.Path:
        """Directory ``debug/layout`` below the dataset root."""
        return self._debug_directory / "layout"

    @property
    def debug_pdf_geometries_directory(self) -> pathlib.Path:
        """Directory ``debug/geometries`` below the dataset root."""
        return self._debug_directory / "geometries"

    @property
    def debug_pdf_bboxes_final_directory(self) -> pathlib.Path:
        """Directory ``debug/bboxes_final`` below the dataset root."""
        return self._debug_directory / "bboxes_final"

    @property
    def debug_pdf_bboxes_directory(self) -> pathlib.Path:
        """Directory ``debug/bboxes`` below the dataset root."""
        return self._debug_directory / "bboxes"

    @property
    def debug_pdf_bboxes_and_geos_directory(self) -> pathlib.Path:
        """Directory ``debug/bboxes_and_geos`` below the dataset root."""
        return self._debug_directory / "bboxes_and_geos"

    @property
    def debug_ocr_bboxes_and_geos_directory(self) -> pathlib.Path:
        """Directory ``debug/ocr_bboxes_and_geos`` below the dataset root."""
        return self._debug_directory / "ocr_bboxes_and_geos"

    @property
    def debug_html_raw_directory(self) -> pathlib.Path:
        """Directory ``debug/html_raw`` below the dataset root."""
        return self._debug_directory / "html_raw"

    # ------------------------------------------------------------------
    # Per-document bounding-box files
    # ------------------------------------------------------------------
    # The ``<level>`` sub-folders used below are not created by ``__init__``
    # (only their parent directory is), and these getters never touch disk.

    def get_pdf_bbox_path(
        self, level: Literal["word", "char"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_pdf_directory>/<level>/<doc_id>.txt``.

        Args:
            level: Sub-folder name, ``"word"`` or ``"char"``.
            doc_id: Document identifier used as the file stem.
        """
        return self.bboxes_pdf_directory / level / f"{doc_id}.txt"

    def get_final_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_final_directory>/<level>/<doc_id>.txt``.

        Args:
            level: Sub-folder name, ``"word"`` or ``"segment"``.
            doc_id: Document identifier used as the file stem.
        """
        return self.bboxes_final_directory / level / f"{doc_id}.txt"

    def get_final_normalized_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ) -> pathlib.Path:
        """Return ``<bboxes_final_normalized_directory>/<level>/<doc_id>.txt``.

        Args:
            level: Sub-folder name, ``"word"`` or ``"segment"``.
            doc_id: Document identifier used as the file stem.
        """
        return self.bboxes_final_normalized_directory / level / f"{doc_id}.txt"