Moses Paul R
committed on
Commit
·
5aa8146
1
Parent(s):
974e8d0
reduce threshold and remove dead code [skip ci]
Browse files
- marker/v2/builders/layout.py +1 -12
- marker/v2/converters/pdf.py +1 -1
marker/v2/builders/layout.py
CHANGED
|
@@ -110,22 +110,11 @@ class LayoutBuilder(BaseBuilder):
|
|
| 110 |
document_page.add_full_block(span)
|
| 111 |
line.add_structure(span)
|
| 112 |
|
| 113 |
-
for line_idx in provider_line_idxs.difference(assigned_line_idxs):
|
| 114 |
-
line = provider_lines[line_idx]
|
| 115 |
-
document_page.add_full_block(line)
|
| 116 |
-
text_block = document_page.add_block(Text, polygon=line.polygon)
|
| 117 |
-
document_page.add_structure(text_block)
|
| 118 |
-
text_block.text_extraction_method = "pdftext"
|
| 119 |
-
text_block.add_structure(line)
|
| 120 |
-
for span in line_spans[line_idx]:
|
| 121 |
-
document_page.add_full_block(span)
|
| 122 |
-
text_block.add_structure(span)
|
| 123 |
-
|
| 124 |
def check_layout_coverage(
|
| 125 |
self,
|
| 126 |
document_page: PageGroup,
|
| 127 |
provider_lines: List[Line],
|
| 128 |
-
coverage_threshold=0.
|
| 129 |
):
|
| 130 |
layout_area = 0
|
| 131 |
provider_area = 0
|
|
|
|
| 110 |
document_page.add_full_block(span)
|
| 111 |
line.add_structure(span)
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def check_layout_coverage(
|
| 114 |
self,
|
| 115 |
document_page: PageGroup,
|
| 116 |
provider_lines: List[Line],
|
| 117 |
+
coverage_threshold=0.5
|
| 118 |
):
|
| 119 |
layout_area = 0
|
| 120 |
provider_area = 0
|
marker/v2/converters/pdf.py
CHANGED
|
@@ -29,7 +29,7 @@ class PdfConverter(BaseConverter):
|
|
| 29 |
self.detection_model = setup_detection_model()
|
| 30 |
|
| 31 |
def __call__(self, filepath: str, page_range: List[int] | None = None):
|
| 32 |
-
pdf_provider = PdfProvider(filepath, {"page_range": page_range})
|
| 33 |
|
| 34 |
layout_builder = LayoutBuilder(self.layout_model)
|
| 35 |
ocr_builder = OcrBuilder(self.detection_model, self.recognition_model)
|
|
|
|
| 29 |
self.detection_model = setup_detection_model()
|
| 30 |
|
| 31 |
def __call__(self, filepath: str, page_range: List[int] | None = None):
|
| 32 |
+
pdf_provider = PdfProvider(filepath, {"page_range": page_range, "force_ocr": False})
|
| 33 |
|
| 34 |
layout_builder = LayoutBuilder(self.layout_model)
|
| 35 |
ocr_builder = OcrBuilder(self.detection_model, self.recognition_model)
|