Vik Paruchuri
committed on
Commit
·
9e13e0a
1
Parent(s):
4b824ac
Small bugfix
Browse files
- README.md +1 -1
- marker/output.py +19 -1
- marker/processors/list.py +4 -0
- pyproject.toml +1 -1
README.md
CHANGED
|
@@ -256,7 +256,7 @@ Pages have the keys:
|
|
| 256 |
|
| 257 |
- `id` - unique id for the block.
|
| 258 |
- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
|
| 259 |
-
- `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/
|
| 260 |
- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
|
| 261 |
- `children` - the child blocks.
|
| 262 |
|
|
|
|
| 256 |
|
| 257 |
- `id` - unique id for the block.
|
| 258 |
- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
|
| 259 |
+
- `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML.
|
| 260 |
- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
|
| 261 |
- `children` - the child blocks.
|
| 262 |
|
marker/output.py
CHANGED
|
@@ -1,13 +1,31 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.renderers.html import HTMLOutput
|
| 7 |
-
from marker.renderers.json import JSONOutput
|
| 8 |
from marker.renderers.markdown import MarkdownOutput
|
| 9 |
from marker.settings import settings
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def output_exists(output_dir: str, fname_base: str):
|
| 13 |
exts = ["md", "html", "json"]
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
| 7 |
from marker.renderers.html import HTMLOutput
|
| 8 |
+
from marker.renderers.json import JSONOutput, JSONBlockOutput
|
| 9 |
from marker.renderers.markdown import MarkdownOutput
|
| 10 |
from marker.settings import settings
|
| 11 |
|
| 12 |
+
def json_to_html(block: JSONBlockOutput):
|
| 13 |
+
# Utility function to take in json block output and give html for the block.
|
| 14 |
+
if not getattr(block, "children", None):
|
| 15 |
+
return block.html
|
| 16 |
+
else:
|
| 17 |
+
child_html = [json_to_html(child) for child in block.children]
|
| 18 |
+
child_ids = [child.id for child in block.children]
|
| 19 |
+
|
| 20 |
+
soup = BeautifulSoup(block.html, "html.parser")
|
| 21 |
+
content_refs = soup.find_all("content-ref")
|
| 22 |
+
for ref in content_refs:
|
| 23 |
+
src_id = ref.attrs["src"]
|
| 24 |
+
if src_id in child_ids:
|
| 25 |
+
child_soup = BeautifulSoup(child_html[child_ids.index(src_id)], "html.parser")
|
| 26 |
+
ref.replace_with(child_soup)
|
| 27 |
+
return str(soup)
|
| 28 |
+
|
| 29 |
|
| 30 |
def output_exists(output_dir: str, fname_base: str):
|
| 31 |
exts = ["md", "html", "json"]
|
marker/processors/list.py
CHANGED
|
@@ -66,6 +66,10 @@ class ListProcessor(BaseProcessor):
|
|
| 66 |
for list_item_id in block.structure:
|
| 67 |
list_item_block: ListItem = page.get_block(list_item_id)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
|
| 70 |
stack.pop()
|
| 71 |
|
|
|
|
| 66 |
for list_item_id in block.structure:
|
| 67 |
list_item_block: ListItem = page.get_block(list_item_id)
|
| 68 |
|
| 69 |
+
# This can be a line sometimes
|
| 70 |
+
if list_item_block.block_type != BlockTypes.ListItem:
|
| 71 |
+
continue
|
| 72 |
+
|
| 73 |
while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
|
| 74 |
stack.pop()
|
| 75 |
|
pyproject.toml
CHANGED
|
@@ -28,7 +28,7 @@ texify = "^0.2.1"
|
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
surya-ocr = "~0.9.3"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
-
pdftext = "~0.5.
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
click = "^8.1.7"
|
| 34 |
google-generativeai = "^0.8.3"
|
|
|
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
surya-ocr = "~0.9.3"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
+
pdftext = "~0.5.1"
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
click = "^8.1.7"
|
| 34 |
google-generativeai = "^0.8.3"
|