File size: 10,066 Bytes
dc4e6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import pathlib
from typing import Literal

from docgenie import ENV


class SyntheticDatasetFileStructure:
    """Path layout of one synthetic dataset stored under ``ENV.SYN_DATASETS_DIR``.

    Every property derives its location from :attr:`base_path`, which is
    ``ENV.SYN_DATASETS_DIR / ds_name``. Creating an instance also creates the
    directories listed in ``__init__`` on disk (missing parents included).
    """

    def __init__(self, ds_name: str):
        """Remember the dataset name and create the dataset's directories.

        Args:
            ds_name: Name of the dataset; it becomes the folder name below
                ``ENV.SYN_DATASETS_DIR``.
        """
        self.ds_name = ds_name

        # Creation order mirrors the pipeline layout; every entry is created
        # with its parents and an already existing directory is accepted.
        managed_directories = (
            self.prompt_batches_directory,
            self.message_results_directory,
            self.preprocessed_seed_images_directory,
            self.message_processing_logs_directory,
            self.raw_html_directory,
            self.render_html_directory,
            self.render_html_second_pass_directory,
            self.geometries_directory,
            self.raw_annotations_directory,
            self.pdf_initial_directory,
            self.pdf_with_handwriting_directory,
            self.pdf_without_handwriting_placeholder_directory,
            self.final_pdf_directory,
            self.bboxes_pdf_directory,
            self.bboxes_final_directory,
            self.bboxes_final_normalized_directory,
            self.ocr_results_directory,
            self.img_directory,
            self.gt_directory,
            self.document_logs_directory,
            self.handwritten_bboxes_directory,
            self.visual_element_definitions_directory,
            self.visual_elements_directory,
            self.layout_element_definitions_directory,
            # Directory for handwritten text images
            self.handwritten_text_images_directory,
            # Debug output directories
            self.debug_pdf_visual_elements_directory,
            self.debug_pdf_handwriting_directory,
            self.debug_pdf_layout_directory,
            self.debug_pdf_geometries_directory,
            self.debug_pdf_bboxes_final_directory,
            self.debug_pdf_bboxes_directory,
            self.debug_pdf_bboxes_and_geos_directory,
            self.debug_ocr_bboxes_and_geos_directory,
            self.debug_html_raw_directory,
        )
        for directory in managed_directories:
            directory.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Dataset root and top-level log files
    # ------------------------------------------------------------------

    @property
    def base_path(self) -> pathlib.Path:
        """Root folder of this dataset: ``ENV.SYN_DATASETS_DIR / ds_name``."""
        return ENV.SYN_DATASETS_DIR / self.ds_name

    @property
    def ds_log_path(self) -> pathlib.Path:
        """Path of ``dataset_log.json`` directly inside the dataset root."""
        return self.base_path / "dataset_log.json"

    @property
    def ds_csv_log_path(self) -> pathlib.Path:
        """Path of ``dataset_log.csv`` directly inside the dataset root."""
        return self.base_path / "dataset_log.csv"

    # ------------------------------------------------------------------
    # Logs
    # ------------------------------------------------------------------

    @property
    def _logs_directory(self) -> pathlib.Path:
        """Parent folder ``logs`` shared by all log sub-directories."""
        return self.base_path / "logs"

    # Keep on reset
    @property
    def prompt_batches_directory(self) -> pathlib.Path:
        """Folder ``logs/prompt_batches``."""
        return self._logs_directory / "prompt_batches"

    # Keep on reset
    @property
    def message_results_directory(self) -> pathlib.Path:
        """Folder ``logs/prompt_message_results``."""
        return self._logs_directory / "prompt_message_results"

    # Keep on reset
    @property
    def preprocessed_seed_images_directory(self) -> pathlib.Path:
        """Folder ``preprocessed_seed_images``."""
        return self.base_path / "preprocessed_seed_images"

    @property
    def message_processing_logs_directory(self) -> pathlib.Path:
        """Folder ``logs/message_processing_logs``."""
        return self._logs_directory / "message_processing_logs"

    @property
    def document_logs_directory(self) -> pathlib.Path:
        """Folder ``logs/document_logs``."""
        return self._logs_directory / "document_logs"

    # ------------------------------------------------------------------
    # HTML
    # ------------------------------------------------------------------

    @property
    def _html_directory(self) -> pathlib.Path:
        """Parent folder ``html`` shared by all HTML sub-directories."""
        return self.base_path / "html"

    @property
    def raw_html_directory(self) -> pathlib.Path:
        """Folder ``html/raw_html``."""
        return self._html_directory / "raw_html"

    @property
    def render_html_directory(self) -> pathlib.Path:
        """Folder ``html/render_html_pass1`` (first render pass)."""
        return self._html_directory / "render_html_pass1"

    @property
    def render_html_second_pass_directory(self) -> pathlib.Path:
        """Folder ``html/render_html_pass2`` (second render pass)."""
        return self._html_directory / "render_html_pass2"

    # ------------------------------------------------------------------
    # Geometries, OCR, images
    # ------------------------------------------------------------------

    @property
    def geometries_directory(self) -> pathlib.Path:
        """Folder ``geometries``."""
        return self.base_path / "geometries"

    @property
    def ocr_results_directory(self) -> pathlib.Path:
        """OCR results for documents which contain handwriting or visual elements."""
        return self.base_path / "ocr_results"

    @property
    def img_directory(self) -> pathlib.Path:
        """Folder ``img``."""
        return self.base_path / "img"

    # ------------------------------------------------------------------
    # PDFs
    # ------------------------------------------------------------------

    @property
    def _pdf_directory(self) -> pathlib.Path:
        """Parent folder ``pdf`` shared by all PDF sub-directories."""
        return self.base_path / "pdf"

    @property
    def pdf_initial_directory(self) -> pathlib.Path:
        """PDFs in which the handwriting HTML text is visible."""
        return self._pdf_directory / "pdf_initial"

    @property
    def pdf_without_handwriting_placeholder_directory(self) -> pathlib.Path:
        """PDFs in which handwriting HTML text and visual element placeholders are invisible."""
        return self._pdf_directory / "pdf_without_handwriting_placeholder"

    @property
    def pdf_with_handwriting_directory(self) -> pathlib.Path:
        """PDFs where handwriting and visual elements are invisible.

        (Two render passes are needed because transparent text is not
        included in the PDF.)
        """
        return self._pdf_directory / "pdf_with_handwriting"

    @property
    def final_pdf_directory(self) -> pathlib.Path:
        """Final PDFs with handwriting and visual elements."""
        return self._pdf_directory / "pdf_final"

    # ------------------------------------------------------------------
    # Bounding boxes
    # ------------------------------------------------------------------

    @property
    def _bbox_directory(self) -> pathlib.Path:
        """Parent folder ``bbox`` shared by all bounding-box sub-directories."""
        return self.base_path / "bbox"

    @property
    def bboxes_pdf_directory(self) -> pathlib.Path:
        """Bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_pdf"

    @property
    def bboxes_final_directory(self) -> pathlib.Path:
        """Final bounding boxes.

        For documents which contain handwriting or visual elements these are
        the boxes retrieved via OCR; otherwise they are the boxes extracted
        from the PDF.
        """
        return self._bbox_directory / "bbox_final"

    @property
    def bboxes_final_normalized_directory(self) -> pathlib.Path:
        """Final bounding boxes, normalized to image size."""
        return self._bbox_directory / "bbox_final_normalized"

    # ------------------------------------------------------------------
    # Annotations
    # ------------------------------------------------------------------

    @property
    def _annotations_directory(self) -> pathlib.Path:
        """Parent folder ``annotations`` shared by annotation sub-directories."""
        return self.base_path / "annotations"

    @property
    def gt_directory(self) -> pathlib.Path:
        """Folder ``annotations/gt``."""
        return self._annotations_directory / "gt"

    @property
    def raw_annotations_directory(self) -> pathlib.Path:
        """Folder ``annotations/raw_annotations``."""
        return self._annotations_directory / "raw_annotations"

    # ------------------------------------------------------------------
    # Handwriting
    # ------------------------------------------------------------------

    @property
    def _handwriting_directory(self) -> pathlib.Path:
        """Parent folder ``handwriting`` shared by handwriting sub-directories."""
        return self.base_path / "handwriting"

    @property
    def handwritten_bboxes_directory(self) -> pathlib.Path:
        """Folder ``handwriting/handwriting_bbox``."""
        return self._handwriting_directory / "handwriting_bbox"

    # Directory for handwritten text images
    @property
    def handwritten_text_images_directory(self) -> pathlib.Path:
        """Folder ``handwriting/handwriting_raw_tokens``."""
        return self._handwriting_directory / "handwriting_raw_tokens"

    # ------------------------------------------------------------------
    # Visual and layout elements
    # ------------------------------------------------------------------

    @property
    def _visual_elements_directory(self) -> pathlib.Path:
        """Parent folder ``visual_elements`` shared by its sub-directories."""
        return self.base_path / "visual_elements"

    @property
    def visual_element_definitions_directory(self) -> pathlib.Path:
        """Folder ``visual_elements/visual_element_definitions``."""
        return self._visual_elements_directory / "visual_element_definitions"

    @property
    def visual_elements_directory(self) -> pathlib.Path:
        """Folder ``visual_elements/visual_elements_images``."""
        return self._visual_elements_directory / "visual_elements_images"

    @property
    def layout_element_definitions_directory(self) -> pathlib.Path:
        """Folder ``layout_element_definitions``."""
        return self.base_path / "layout_element_definitions"

    # ------------------------------------------------------------------
    # Debug output
    # ------------------------------------------------------------------

    @property
    def _debug_directory(self) -> pathlib.Path:
        """Parent folder ``debug`` shared by all debug sub-directories."""
        return self.base_path / "debug"

    @property
    def debug_pdf_visual_elements_directory(self) -> pathlib.Path:
        """Folder ``debug/visual_elements``."""
        return self._debug_directory / "visual_elements"

    @property
    def debug_pdf_handwriting_directory(self) -> pathlib.Path:
        """Folder ``debug/handwriting``."""
        return self._debug_directory / "handwriting"

    @property
    def debug_pdf_layout_directory(self) -> pathlib.Path:
        """Folder ``debug/layout``."""
        return self._debug_directory / "layout"

    @property
    def debug_pdf_geometries_directory(self) -> pathlib.Path:
        """Folder ``debug/geometries``."""
        return self._debug_directory / "geometries"

    @property
    def debug_pdf_bboxes_final_directory(self) -> pathlib.Path:
        """Folder ``debug/bboxes_final``."""
        return self._debug_directory / "bboxes_final"

    @property
    def debug_pdf_bboxes_directory(self) -> pathlib.Path:
        """Folder ``debug/bboxes``."""
        return self._debug_directory / "bboxes"

    @property
    def debug_pdf_bboxes_and_geos_directory(self) -> pathlib.Path:
        """Folder ``debug/bboxes_and_geos``."""
        return self._debug_directory / "bboxes_and_geos"

    @property
    def debug_ocr_bboxes_and_geos_directory(self) -> pathlib.Path:
        """Folder ``debug/ocr_bboxes_and_geos``."""
        return self._debug_directory / "ocr_bboxes_and_geos"

    @property
    def debug_html_raw_directory(self) -> pathlib.Path:
        """Folder ``debug/html_raw``."""
        return self._debug_directory / "html_raw"

    # ------------------------------------------------------------------
    # Per-document bounding-box files
    # ------------------------------------------------------------------

    def get_pdf_bbox_path(self, level: Literal["word", "char"], doc_id: str):
        """Return ``<bbox_pdf>/<level>/<doc_id>.txt``.

        Only the path is built; the ``level`` sub-folder is not created here.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_pdf_directory / level / file_name

    def get_final_bbox_path(self, level: Literal["word", "segment"], doc_id: str):
        """Return ``<bbox_final>/<level>/<doc_id>.txt``.

        Only the path is built; the ``level`` sub-folder is not created here.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_final_directory / level / file_name

    def get_final_normalized_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ):
        """Return ``<bbox_final_normalized>/<level>/<doc_id>.txt``.

        Only the path is built; the ``level`` sub-folder is not created here.
        """
        file_name = f"{doc_id}.txt"
        return self.bboxes_final_normalized_directory / level / file_name