Vik Paruchuri committed on
Commit
9e13e0a
·
1 Parent(s): 4b824ac

Small bugfix

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. marker/output.py +19 -1
  3. marker/processors/list.py +4 -0
  4. pyproject.toml +1 -1
README.md CHANGED
@@ -256,7 +256,7 @@ Pages have the keys:
256
 
257
  - `id` - unique id for the block.
258
  - `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
259
- - `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/renderers/__init__.py:BaseRender.extract_block_html`.
260
  - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
261
  - `children` - the child blocks.
262
 
 
256
 
257
  - `id` - unique id for the block.
258
  - `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
259
+ - `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full HTML. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the JSON output, and turn it into HTML.
260
  - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
261
  - `children` - the child blocks.
262
 
marker/output.py CHANGED
@@ -1,13 +1,31 @@
1
  import json
2
  import os
3
 
 
4
  from pydantic import BaseModel
5
 
6
  from marker.renderers.html import HTMLOutput
7
- from marker.renderers.json import JSONOutput
8
  from marker.renderers.markdown import MarkdownOutput
9
  from marker.settings import settings
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def output_exists(output_dir: str, fname_base: str):
13
  exts = ["md", "html", "json"]
 
1
  import json
2
  import os
3
 
4
+ from bs4 import BeautifulSoup
5
  from pydantic import BaseModel
6
 
7
  from marker.renderers.html import HTMLOutput
8
+ from marker.renderers.json import JSONOutput, JSONBlockOutput
9
  from marker.renderers.markdown import MarkdownOutput
10
  from marker.settings import settings
11
 
12
+ def json_to_html(block: JSONBlockOutput):
13
+ # Utility function that takes a JSON block output and returns the HTML for the block, with each `content-ref` tag replaced by the matching child's HTML.
14
+ if not getattr(block, "children", None):
15
+ return block.html
16
+ else:
17
+ child_html = [json_to_html(child) for child in block.children]
18
+ child_ids = [child.id for child in block.children]
19
+
20
+ soup = BeautifulSoup(block.html, "html.parser")
21
+ content_refs = soup.find_all("content-ref")
22
+ for ref in content_refs:
23
+ src_id = ref.attrs["src"]
24
+ if src_id in child_ids:
25
+ child_soup = BeautifulSoup(child_html[child_ids.index(src_id)], "html.parser")
26
+ ref.replace_with(child_soup)
27
+ return str(soup)
28
+
29
 
30
  def output_exists(output_dir: str, fname_base: str):
31
  exts = ["md", "html", "json"]
marker/processors/list.py CHANGED
@@ -66,6 +66,10 @@ class ListProcessor(BaseProcessor):
66
  for list_item_id in block.structure:
67
  list_item_block: ListItem = page.get_block(list_item_id)
68
 
 
 
 
 
69
  while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
70
  stack.pop()
71
 
 
66
  for list_item_id in block.structure:
67
  list_item_block: ListItem = page.get_block(list_item_id)
68
 
69
+ # The fetched block can sometimes be a Line rather than a ListItem; skip those
70
+ if list_item_block.block_type != BlockTypes.ListItem:
71
+ continue
72
+
73
  while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
74
  stack.pop()
75
 
pyproject.toml CHANGED
@@ -28,7 +28,7 @@ texify = "^0.2.1"
28
  rapidfuzz = "^3.8.1"
29
  surya-ocr = "~0.9.3"
30
  regex = "^2024.4.28"
31
- pdftext = "~0.5.0"
32
  markdownify = "^0.13.1"
33
  click = "^8.1.7"
34
  google-generativeai = "^0.8.3"
 
28
  rapidfuzz = "^3.8.1"
29
  surya-ocr = "~0.9.3"
30
  regex = "^2024.4.28"
31
+ pdftext = "~0.5.1"
32
  markdownify = "^0.13.1"
33
  click = "^8.1.7"
34
  google-generativeai = "^0.8.3"