Vik Paruchuri
committed on
Commit
·
2b12856
1
Parent(s):
98f060b
Bugfixes
Browse files- README.md +5 -6
- marker/builders/ocr.py +1 -1
- marker/processors/code.py +4 -0
- marker/processors/footnote.py +34 -17
- marker/schema/blocks/base.py +1 -1
README.md
CHANGED
|
@@ -25,10 +25,9 @@ It only uses models where necessary, which improves speed and accuracy.
|
|
| 25 |
## Examples
|
| 26 |
|
| 27 |
| PDF | Markdown | JSON |
|
| 28 |
-
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/thinkpython.md) |
|
| 29 |
-
| [
|
| 30 |
-
| [
|
| 31 |
-
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/nougat/multicolcnn.md) |
|
| 32 |
|
| 33 |
## Performance
|
| 34 |
|
|
@@ -106,7 +105,7 @@ Options:
|
|
| 106 |
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
|
| 107 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
| 108 |
- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "eng,fra,deu"` for English, French, and German.
|
| 109 |
-
-
|
| 110 |
|
| 111 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
|
| 112 |
|
|
@@ -117,7 +116,7 @@ marker /path/to/input/folder --workers 10
|
|
| 117 |
```
|
| 118 |
|
| 119 |
- `marker` supports all the same options from `marker_single` above.
|
| 120 |
-
- `--workers` is the number of conversion workers to run simultaneously. This is set to
|
| 121 |
|
| 122 |
## Convert multiple files on multiple GPUs
|
| 123 |
|
|
|
|
| 25 |
## Examples
|
| 26 |
|
| 27 |
| PDF | Markdown | JSON |
|
| 28 |
+
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/thinkpython/thinkpython.md) |
|
| 29 |
+
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/switch_transformers/switch_transformers.md) |
|
| 30 |
+
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/multicolcnn/multicolcnn.md) |
|
|
|
|
| 31 |
|
| 32 |
## Performance
|
| 33 |
|
|
|
|
| 105 |
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
|
| 106 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
| 107 |
- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "eng,fra,deu"` for English, French, and German.
|
| 108 |
+
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
|
| 109 |
|
| 110 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
|
| 111 |
|
|
|
|
| 116 |
```
|
| 117 |
|
| 118 |
- `marker` supports all the same options from `marker_single` above.
|
| 119 |
+
- `--workers` is the number of conversion workers to run simultaneously. This is set to 5 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
|
| 120 |
|
| 121 |
## Convert multiple files on multiple GPUs
|
| 122 |
|
marker/builders/ocr.py
CHANGED
|
@@ -71,7 +71,7 @@ class OcrBuilder(BaseBuilder):
|
|
| 71 |
det_processor=self.detection_model.processor,
|
| 72 |
rec_model=self.recognition_model,
|
| 73 |
rec_processor=self.recognition_model.processor,
|
| 74 |
-
|
| 75 |
highres_images=[page.highres_image for page in page_list]
|
| 76 |
)
|
| 77 |
|
|
|
|
| 71 |
det_processor=self.detection_model.processor,
|
| 72 |
rec_model=self.recognition_model,
|
| 73 |
rec_processor=self.recognition_model.processor,
|
| 74 |
+
recognition_batch_size=int(self.get_recognition_batch_size()),
|
| 75 |
highres_images=[page.highres_image for page in page_list]
|
| 76 |
)
|
| 77 |
|
marker/processors/code.py
CHANGED
|
@@ -19,6 +19,10 @@ class CodeProcessor(BaseProcessor):
|
|
| 19 |
min_left = 9999 # will contain x- coord of column 0
|
| 20 |
total_width = 0
|
| 21 |
total_chars = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
for line_id in block.structure:
|
| 23 |
line = document.get_block(line_id)
|
| 24 |
min_left = min(line.polygon.bbox[0], min_left)
|
|
|
|
| 19 |
min_left = 9999 # will contain x- coord of column 0
|
| 20 |
total_width = 0
|
| 21 |
total_chars = 0
|
| 22 |
+
if block.structure is None:
|
| 23 |
+
block.code = ""
|
| 24 |
+
return
|
| 25 |
+
|
| 26 |
for line_id in block.structure:
|
| 27 |
line = document.get_block(line_id)
|
| 28 |
min_left = min(line.polygon.bbox[0], min_left)
|
marker/processors/footnote.py
CHANGED
|
@@ -15,45 +15,62 @@ from marker.schema.groups import PageGroup
|
|
| 15 |
class FootnoteProcessor(BaseProcessor):
|
| 16 |
"""
|
| 17 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
block_types = (BlockTypes.Footnote,)
|
| 20 |
-
page_bottom_threshold = .
|
| 21 |
-
|
| 22 |
-
|
| 23 |
|
| 24 |
def __call__(self, document: Document):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
for page in document.pages:
|
| 26 |
-
self.relabel_texts_to_footnotes(page, document)
|
| 27 |
self.push_footnotes_to_bottom(page, document)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
def relabel_texts_to_footnotes(self, page: PageGroup, document: Document):
|
| 31 |
text_blocks = page.contained_blocks(document, (BlockTypes.Text,))
|
| 32 |
block_stats = []
|
| 33 |
|
| 34 |
for block in text_blocks:
|
| 35 |
-
contained_spans = block.contained_blocks(document, (BlockTypes.Span,))
|
| 36 |
-
font_size = [span.font_size for span in contained_spans]
|
| 37 |
contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
|
| 38 |
line_heights = [line.polygon.height for line in contained_lines]
|
| 39 |
|
| 40 |
block_stats.append({
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"line_heights": line_heights,
|
| 44 |
-
"font_sizes": font_size,
|
| 45 |
-
"in_bottom_third": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
|
| 46 |
})
|
| 47 |
|
| 48 |
# Find the average font size and line height
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
|
|
|
| 52 |
for text_block, stats_dict in zip(text_blocks, block_stats):
|
| 53 |
if all([
|
| 54 |
-
stats_dict["
|
| 55 |
-
stats_dict["
|
| 56 |
-
stats_dict["in_bottom_third"]
|
| 57 |
]):
|
| 58 |
new_block = Footnote.from_block(text_block)
|
| 59 |
page.replace_block(text_block, new_block)
|
|
|
|
| 15 |
class FootnoteProcessor(BaseProcessor):
|
| 16 |
"""
|
| 17 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
| 18 |
+
|
| 19 |
+
Attributes:
|
| 20 |
+
page_bottom_threshold (float):
|
| 21 |
+
The fraction of page height that is considered the bottom.
|
| 22 |
+
Default is .75
|
| 23 |
+
|
| 24 |
+
line_height_scaler (float):
|
| 25 |
+
The amount to scale line height by to consider a block a footnote.
|
| 26 |
+
Default is .85
|
| 27 |
"""
|
| 28 |
block_types = (BlockTypes.Footnote,)
|
| 29 |
+
page_bottom_threshold = .75
|
| 30 |
+
line_height_scaler = .85
|
| 31 |
+
|
| 32 |
|
| 33 |
def __call__(self, document: Document):
|
| 34 |
+
footnote_heights = self.compute_block_stats(document)
|
| 35 |
+
if len(footnote_heights) == 0:
|
| 36 |
+
footnote_heights = [999]
|
| 37 |
+
|
| 38 |
+
avg_footnote_height = mean(footnote_heights)
|
| 39 |
for page in document.pages:
|
| 40 |
+
self.relabel_texts_to_footnotes(page, document, avg_footnote_height)
|
| 41 |
self.push_footnotes_to_bottom(page, document)
|
| 42 |
|
| 43 |
+
def compute_block_stats(self, document: Document):
|
| 44 |
+
line_heights = []
|
| 45 |
+
for page in document.pages:
|
| 46 |
+
for footnote in page.contained_blocks(document, self.block_types):
|
| 47 |
+
contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
|
| 48 |
+
line_heights.extend([line.polygon.height for line in contained_lines])
|
| 49 |
+
return line_heights
|
| 50 |
+
|
| 51 |
|
| 52 |
+
def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_footnote_height: int):
|
| 53 |
text_blocks = page.contained_blocks(document, (BlockTypes.Text,))
|
| 54 |
block_stats = []
|
| 55 |
|
| 56 |
for block in text_blocks:
|
|
|
|
|
|
|
| 57 |
contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
|
| 58 |
line_heights = [line.polygon.height for line in contained_lines]
|
| 59 |
|
| 60 |
block_stats.append({
|
| 61 |
+
"line_height": mean(line_heights) if len(line_heights) > 0 else 999,
|
| 62 |
+
"in_bottom": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
|
|
|
|
|
|
|
|
|
|
| 63 |
})
|
| 64 |
|
| 65 |
# Find the average font size and line height
|
| 66 |
+
if len(block_stats) == 0:
|
| 67 |
+
return
|
| 68 |
|
| 69 |
+
height_gap = 1 - self.line_height_scaler
|
| 70 |
for text_block, stats_dict in zip(text_blocks, block_stats):
|
| 71 |
if all([
|
| 72 |
+
avg_footnote_height * self.line_height_scaler < stats_dict["line_height"] < avg_footnote_height * (1 + height_gap),
|
| 73 |
+
stats_dict["in_bottom"]
|
|
|
|
| 74 |
]):
|
| 75 |
new_block = Footnote.from_block(text_block)
|
| 76 |
page.replace_block(text_block, new_block)
|
marker/schema/blocks/base.py
CHANGED
|
@@ -77,7 +77,7 @@ class Block(BaseModel):
|
|
| 77 |
|
| 78 |
@classmethod
|
| 79 |
def from_block(cls, block: Block) -> Block:
|
| 80 |
-
block_attrs = block.model_dump(exclude=["id", "block_id"])
|
| 81 |
return cls(**block_attrs)
|
| 82 |
|
| 83 |
def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
|
|
|
|
| 77 |
|
| 78 |
@classmethod
|
| 79 |
def from_block(cls, block: Block) -> Block:
|
| 80 |
+
block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
|
| 81 |
return cls(**block_attrs)
|
| 82 |
|
| 83 |
def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
|