Vik Paruchuri committed on
Commit
2b12856
·
1 Parent(s): 98f060b
README.md CHANGED
@@ -25,10 +25,9 @@ It only uses models where necessary, which improves speed and accuracy.
25
  ## Examples
26
 
27
  | PDF | Markdown | JSON |
28
- | [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/thinkpython.md) |
29
- | [Think OS](https://greenteapress.com/thinkos/thinkos.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/thinkos.md) |
30
- | [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/nougat/switch_transformers.md) |
31
- | [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/nougat/multicolcnn.md) |
32
 
33
  ## Performance
34
 
@@ -106,7 +105,7 @@ Options:
106
  - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
107
  - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
108
  - `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "eng,fra,deu"` for English, French, and German.
109
- - `-l`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
110
 
111
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
112
 
@@ -117,7 +116,7 @@ marker /path/to/input/folder --workers 10
117
  ```
118
 
119
  - `marker` supports all the same options from `marker_single` above.
120
- - `--workers` is the number of conversion workers to run simultaneously. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
121
 
122
  ## Convert multiple files on multiple GPUs
123
 
 
25
  ## Examples
26
 
27
  | PDF | Markdown | JSON |
28
+ | [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/thinkpython/thinkpython.md) |
29
+ | [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/switch_transformers/switch_transformers.md) |
30
+ | [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/marker/multicolcnn/multicolcnn.md) |
 
31
 
32
  ## Performance
33
 
 
105
  - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
106
  - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
107
  - `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "eng,fra,deu"` for English, French, and German.
108
+ - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
109
 
110
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
111
 
 
116
  ```
117
 
118
  - `marker` supports all the same options from `marker_single` above.
119
+ - `--workers` is the number of conversion workers to run simultaneously. This is set to 5 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
120
 
121
  ## Convert multiple files on multiple GPUs
122
 
marker/builders/ocr.py CHANGED
@@ -71,7 +71,7 @@ class OcrBuilder(BaseBuilder):
71
  det_processor=self.detection_model.processor,
72
  rec_model=self.recognition_model,
73
  rec_processor=self.recognition_model.processor,
74
- batch_size=int(self.get_recognition_batch_size()),
75
  highres_images=[page.highres_image for page in page_list]
76
  )
77
 
 
71
  det_processor=self.detection_model.processor,
72
  rec_model=self.recognition_model,
73
  rec_processor=self.recognition_model.processor,
74
+ recognition_batch_size=int(self.get_recognition_batch_size()),
75
  highres_images=[page.highres_image for page in page_list]
76
  )
77
 
marker/processors/code.py CHANGED
@@ -19,6 +19,10 @@ class CodeProcessor(BaseProcessor):
19
  min_left = 9999 # will contain x- coord of column 0
20
  total_width = 0
21
  total_chars = 0
 
 
 
 
22
  for line_id in block.structure:
23
  line = document.get_block(line_id)
24
  min_left = min(line.polygon.bbox[0], min_left)
 
19
  min_left = 9999 # will contain x- coord of column 0
20
  total_width = 0
21
  total_chars = 0
22
+ if block.structure is None:
23
+ block.code = ""
24
+ return
25
+
26
  for line_id in block.structure:
27
  line = document.get_block(line_id)
28
  min_left = min(line.polygon.bbox[0], min_left)
marker/processors/footnote.py CHANGED
@@ -15,45 +15,62 @@ from marker.schema.groups import PageGroup
15
  class FootnoteProcessor(BaseProcessor):
16
  """
17
  A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
 
 
 
 
 
 
 
 
 
18
  """
19
  block_types = (BlockTypes.Footnote,)
20
- page_bottom_threshold = .66
21
- font_size_scaler = .5
22
- line_height_scaler = .5
23
 
24
  def __call__(self, document: Document):
 
 
 
 
 
25
  for page in document.pages:
26
- self.relabel_texts_to_footnotes(page, document)
27
  self.push_footnotes_to_bottom(page, document)
28
 
 
 
 
 
 
 
 
 
29
 
30
- def relabel_texts_to_footnotes(self, page: PageGroup, document: Document):
31
  text_blocks = page.contained_blocks(document, (BlockTypes.Text,))
32
  block_stats = []
33
 
34
  for block in text_blocks:
35
- contained_spans = block.contained_blocks(document, (BlockTypes.Span,))
36
- font_size = [span.font_size for span in contained_spans]
37
  contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
38
  line_heights = [line.polygon.height for line in contained_lines]
39
 
40
  block_stats.append({
41
- "font_size": mean(font_size),
42
- "line_height": mean(line_heights),
43
- "line_heights": line_heights,
44
- "font_sizes": font_size,
45
- "in_bottom_third": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
46
  })
47
 
48
  # Find the average font size and line height
49
- avg_font_size = mean([fs for bs in block_stats for fs in bs["font_sizes"]])
50
- avg_line_height = mean([lh for bs in block_stats for lh in bs["line_heights"]])
51
 
 
52
  for text_block, stats_dict in zip(text_blocks, block_stats):
53
  if all([
54
- stats_dict["font_size"] < avg_font_size * self.font_size_scaler,
55
- stats_dict["line_height"] < avg_line_height * self.line_height_scaler,
56
- stats_dict["in_bottom_third"]
57
  ]):
58
  new_block = Footnote.from_block(text_block)
59
  page.replace_block(text_block, new_block)
 
15
  class FootnoteProcessor(BaseProcessor):
16
  """
17
  A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
18
+
19
+ Attributes:
20
+ page_bottom_threshold (float):
21
+ The fraction of page height that is considered the bottom.
22
+ Default is .75
23
+
24
+ line_height_scaler (float):
25
+ The amount to scale line height by to consider a block a footnote.
26
+ Default is .85
27
  """
28
  block_types = (BlockTypes.Footnote,)
29
+ page_bottom_threshold = .75
30
+ line_height_scaler = .85
31
+
32
 
33
  def __call__(self, document: Document):
34
+ footnote_heights = self.compute_block_stats(document)
35
+ if len(footnote_heights) == 0:
36
+ footnote_heights = [999]
37
+
38
+ avg_footnote_height = mean(footnote_heights)
39
  for page in document.pages:
40
+ self.relabel_texts_to_footnotes(page, document, avg_footnote_height)
41
  self.push_footnotes_to_bottom(page, document)
42
 
43
+ def compute_block_stats(self, document: Document):
44
+ line_heights = []
45
+ for page in document.pages:
46
+ for footnote in page.contained_blocks(document, self.block_types):
47
+ contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
48
+ line_heights.extend([line.polygon.height for line in contained_lines])
49
+ return line_heights
50
+
51
 
52
+ def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_footnote_height: int):
53
  text_blocks = page.contained_blocks(document, (BlockTypes.Text,))
54
  block_stats = []
55
 
56
  for block in text_blocks:
 
 
57
  contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
58
  line_heights = [line.polygon.height for line in contained_lines]
59
 
60
  block_stats.append({
61
+ "line_height": mean(line_heights) if len(line_heights) > 0 else 999,
62
+ "in_bottom": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
 
 
 
63
  })
64
 
65
  # Find the average font size and line height
66
+ if len(block_stats) == 0:
67
+ return
68
 
69
+ height_gap = 1 - self.line_height_scaler
70
  for text_block, stats_dict in zip(text_blocks, block_stats):
71
  if all([
72
+ avg_footnote_height * self.line_height_scaler < stats_dict["line_height"] < avg_footnote_height * (1 + height_gap),
73
+ stats_dict["in_bottom"]
 
74
  ]):
75
  new_block = Footnote.from_block(text_block)
76
  page.replace_block(text_block, new_block)
marker/schema/blocks/base.py CHANGED
@@ -77,7 +77,7 @@ class Block(BaseModel):
77
 
78
  @classmethod
79
  def from_block(cls, block: Block) -> Block:
80
- block_attrs = block.model_dump(exclude=["id", "block_id"])
81
  return cls(**block_attrs)
82
 
83
  def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
 
77
 
78
  @classmethod
79
  def from_block(cls, block: Block) -> Block:
80
+ block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
81
  return cls(**block_attrs)
82
 
83
  def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]: