Vik Paruchuri
committed on
Commit
·
ff24a58
1
Parent(s):
6c81421
Add order processor
Browse files- benchmarks/overall/inference.py +5 -5
- benchmarks/overall/overall.py +1 -1
- benchmarks/overall/render.py +11 -3
- benchmarks/overall/schema.py +1 -0
- marker/converters/pdf.py +2 -0
- marker/processors/order.py +18 -5
- marker/schema/polygon.py +3 -0
- marker/util.py +19 -0
benchmarks/overall/inference.py
CHANGED
|
@@ -23,9 +23,9 @@ def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs
|
|
| 23 |
pdf_bytes = sample["pdf"] # This is a single page PDF
|
| 24 |
start = time.time()
|
| 25 |
marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
|
| 26 |
-
|
| 27 |
total = time.time() - start
|
| 28 |
-
scores = score_blocks(gt_markdown,
|
| 29 |
scores["time"] = total
|
| 30 |
scores["markdown"] = marker_md
|
| 31 |
return scores
|
|
@@ -41,8 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwa
|
|
| 41 |
if not data:
|
| 42 |
raise ValueError(f"Could not find data for uuid {uuid}")
|
| 43 |
|
| 44 |
-
|
| 45 |
-
scores = score_blocks(gt_markdown,
|
| 46 |
scores["time"] = data["time"]
|
| 47 |
-
scores["markdown"] =
|
| 48 |
return scores
|
|
|
|
| 23 |
pdf_bytes = sample["pdf"] # This is a single page PDF
|
| 24 |
start = time.time()
|
| 25 |
marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
|
| 26 |
+
marker_md_clean = clean_input(marker_md)
|
| 27 |
total = time.time() - start
|
| 28 |
+
scores = score_blocks(gt_markdown, marker_md_clean)
|
| 29 |
scores["time"] = total
|
| 30 |
scores["markdown"] = marker_md
|
| 31 |
return scores
|
|
|
|
| 41 |
if not data:
|
| 42 |
raise ValueError(f"Could not find data for uuid {uuid}")
|
| 43 |
|
| 44 |
+
mathpix_md_clean = clean_input(data["md"])
|
| 45 |
+
scores = score_blocks(gt_markdown, mathpix_md_clean)
|
| 46 |
scores["time"] = data["time"]
|
| 47 |
+
scores["markdown"] = data["md"]
|
| 48 |
return scores
|
benchmarks/overall/overall.py
CHANGED
|
@@ -56,7 +56,7 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
|
|
| 56 |
"averages_by_type": averages_by_type,
|
| 57 |
"averages_by_block_type": averages_by_block_type,
|
| 58 |
"average_time": avg_time,
|
| 59 |
-
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
|
| 60 |
}
|
| 61 |
|
| 62 |
def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
|
|
|
|
| 56 |
"averages_by_type": averages_by_type,
|
| 57 |
"averages_by_block_type": averages_by_block_type,
|
| 58 |
"average_time": avg_time,
|
| 59 |
+
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
|
| 60 |
}
|
| 61 |
|
| 62 |
def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
|
benchmarks/overall/render.py
CHANGED
|
@@ -12,6 +12,7 @@ import datasets
|
|
| 12 |
import markdown2
|
| 13 |
from playwright.sync_api import sync_playwright
|
| 14 |
|
|
|
|
| 15 |
from benchmarks.overall.schema import FullResult
|
| 16 |
|
| 17 |
def convert_to_html(md: str):
|
|
@@ -90,7 +91,13 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da
|
|
| 90 |
|
| 91 |
ds_rows = defaultdict(dict)
|
| 92 |
for idx in full_idxs:
|
| 93 |
-
row = ds[idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
for method in all_scores:
|
| 95 |
method_row = all_scores[method]["raw_scores"][idx]
|
| 96 |
ds_rows[idx].update({
|
|
@@ -99,10 +106,11 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da
|
|
| 99 |
f"{method}_image": markdown_to_image(method_row["markdown"]),
|
| 100 |
f"{method}_time": method_row["time"]
|
| 101 |
})
|
| 102 |
-
|
|
|
|
| 103 |
ds_rows[idx].update({
|
| 104 |
"gt_markdown": gt_md,
|
| 105 |
-
"
|
| 106 |
})
|
| 107 |
out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
|
| 108 |
return out_dataset
|
|
|
|
| 12 |
import markdown2
|
| 13 |
from playwright.sync_api import sync_playwright
|
| 14 |
|
| 15 |
+
from benchmarks.overall.clean import convert_to_md, clean_input
|
| 16 |
from benchmarks.overall.schema import FullResult
|
| 17 |
|
| 18 |
def convert_to_html(md: str):
|
|
|
|
| 91 |
|
| 92 |
ds_rows = defaultdict(dict)
|
| 93 |
for idx in full_idxs:
|
| 94 |
+
row = ds[idx]
|
| 95 |
+
ds_rows[idx].update({
|
| 96 |
+
"img": row["img"],
|
| 97 |
+
"classification": row["classification"],
|
| 98 |
+
"language": row["language"],
|
| 99 |
+
"uuid": row["uuid"]
|
| 100 |
+
})
|
| 101 |
for method in all_scores:
|
| 102 |
method_row = all_scores[method]["raw_scores"][idx]
|
| 103 |
ds_rows[idx].update({
|
|
|
|
| 106 |
f"{method}_image": markdown_to_image(method_row["markdown"]),
|
| 107 |
f"{method}_time": method_row["time"]
|
| 108 |
})
|
| 109 |
+
gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
|
| 110 |
+
gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
|
| 111 |
ds_rows[idx].update({
|
| 112 |
"gt_markdown": gt_md,
|
| 113 |
+
"gt_markdown_image": markdown_to_image(gt_md)
|
| 114 |
})
|
| 115 |
out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
|
| 116 |
return out_dataset
|
benchmarks/overall/schema.py
CHANGED
|
@@ -15,3 +15,4 @@ class FullResult(TypedDict):
|
|
| 15 |
averages_by_block_type: Dict[str, List[float]]
|
| 16 |
average_time: float
|
| 17 |
average_score: float
|
|
|
|
|
|
| 15 |
averages_by_block_type: Dict[str, List[float]]
|
| 16 |
average_time: float
|
| 17 |
average_score: float
|
| 18 |
+
gt_markdown: List[str]
|
marker/converters/pdf.py
CHANGED
|
@@ -41,6 +41,7 @@ from marker.schema.blocks import Block
|
|
| 41 |
from marker.schema.registry import register_block_class
|
| 42 |
from marker.util import strings_to_classes
|
| 43 |
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
class PdfConverter(BaseConverter):
|
|
@@ -59,6 +60,7 @@ class PdfConverter(BaseConverter):
|
|
| 59 |
"Enable higher quality processing with LLMs.",
|
| 60 |
] = False
|
| 61 |
default_processors: Tuple[BaseProcessor, ...] = (
|
|
|
|
| 62 |
BlockquoteProcessor,
|
| 63 |
CodeProcessor,
|
| 64 |
DocumentTOCProcessor,
|
|
|
|
| 41 |
from marker.schema.registry import register_block_class
|
| 42 |
from marker.util import strings_to_classes
|
| 43 |
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
|
| 44 |
+
from marker.processors.order import OrderProcessor
|
| 45 |
|
| 46 |
|
| 47 |
class PdfConverter(BaseConverter):
|
|
|
|
| 60 |
"Enable higher quality processing with LLMs.",
|
| 61 |
] = False
|
| 62 |
default_processors: Tuple[BaseProcessor, ...] = (
|
| 63 |
+
OrderProcessor,
|
| 64 |
BlockquoteProcessor,
|
| 65 |
CodeProcessor,
|
| 66 |
DocumentTOCProcessor,
|
marker/processors/order.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from statistics import mean
|
|
|
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
|
@@ -13,41 +14,53 @@ class OrderProcessor(BaseProcessor):
|
|
| 13 |
|
| 14 |
def __call__(self, document: Document):
|
| 15 |
for page in document.pages:
|
|
|
|
| 16 |
if page.text_extraction_method != "pdftext":
|
| 17 |
continue
|
| 18 |
|
|
|
|
| 19 |
if not page.layout_sliced:
|
| 20 |
continue
|
| 21 |
|
| 22 |
-
block_idxs =
|
| 23 |
for block_id in page.structure:
|
| 24 |
block = document.get_block(block_id)
|
| 25 |
spans = block.contained_blocks(document, (BlockTypes.Span, ))
|
| 26 |
if len(spans) == 0:
|
| 27 |
continue
|
| 28 |
|
|
|
|
| 29 |
block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
|
| 30 |
|
| 31 |
for block_id in page.structure:
|
| 32 |
-
|
|
|
|
| 33 |
continue
|
|
|
|
| 34 |
block = document.get_block(block_id)
|
| 35 |
prev_block = document.get_prev_block(block)
|
| 36 |
next_block = document.get_next_block(block)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
while prev_block and prev_block.id not in block_idxs:
|
| 39 |
prev_block = document.get_prev_block(prev_block)
|
|
|
|
| 40 |
|
| 41 |
if not prev_block:
|
|
|
|
| 42 |
while next_block and next_block.id not in block_idxs:
|
| 43 |
next_block = document.get_next_block(next_block)
|
|
|
|
| 44 |
|
| 45 |
if not next_block and not prev_block:
|
| 46 |
-
|
| 47 |
elif prev_block:
|
| 48 |
-
block_idxs[block_id] = block_idxs[prev_block.id] +
|
| 49 |
else:
|
| 50 |
-
block_idxs[block_id] = block_idxs[next_block.id]
|
| 51 |
|
| 52 |
page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
|
| 53 |
|
|
|
|
| 1 |
from statistics import mean
|
| 2 |
+
from collections import defaultdict
|
| 3 |
|
| 4 |
from marker.processors import BaseProcessor
|
| 5 |
from marker.schema import BlockTypes
|
|
|
|
| 14 |
|
| 15 |
def __call__(self, document: Document):
|
| 16 |
for page in document.pages:
|
| 17 |
+
# Skip OCRed pages
|
| 18 |
if page.text_extraction_method != "pdftext":
|
| 19 |
continue
|
| 20 |
|
| 21 |
+
# Skip pages without layout slicing
|
| 22 |
if not page.layout_sliced:
|
| 23 |
continue
|
| 24 |
|
| 25 |
+
block_idxs = defaultdict(int)
|
| 26 |
for block_id in page.structure:
|
| 27 |
block = document.get_block(block_id)
|
| 28 |
spans = block.contained_blocks(document, (BlockTypes.Span, ))
|
| 29 |
if len(spans) == 0:
|
| 30 |
continue
|
| 31 |
|
| 32 |
+
# Avg span position in original PDF
|
| 33 |
block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
|
| 34 |
|
| 35 |
for block_id in page.structure:
|
| 36 |
+
# Already assigned block id via span position
|
| 37 |
+
if block_idxs[block_id] > 0:
|
| 38 |
continue
|
| 39 |
+
|
| 40 |
block = document.get_block(block_id)
|
| 41 |
prev_block = document.get_prev_block(block)
|
| 42 |
next_block = document.get_next_block(block)
|
| 43 |
|
| 44 |
+
block_idx_add = 0
|
| 45 |
+
if prev_block:
|
| 46 |
+
block_idx_add = 1
|
| 47 |
+
|
| 48 |
while prev_block and prev_block.id not in block_idxs:
|
| 49 |
prev_block = document.get_prev_block(prev_block)
|
| 50 |
+
block_idx_add += 1
|
| 51 |
|
| 52 |
if not prev_block:
|
| 53 |
+
block_idx_add = -1
|
| 54 |
while next_block and next_block.id not in block_idxs:
|
| 55 |
next_block = document.get_next_block(next_block)
|
| 56 |
+
block_idx_add -= 1
|
| 57 |
|
| 58 |
if not next_block and not prev_block:
|
| 59 |
+
pass
|
| 60 |
elif prev_block:
|
| 61 |
+
block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
|
| 62 |
else:
|
| 63 |
+
block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add
|
| 64 |
|
| 65 |
page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
|
| 66 |
|
marker/schema/polygon.py
CHANGED
|
@@ -126,6 +126,9 @@ class PolygonBox(BaseModel):
|
|
| 126 |
else:
|
| 127 |
return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
def rescale(self, old_size, new_size):
|
| 130 |
# Point is in x, y format
|
| 131 |
page_width, page_height = old_size
|
|
|
|
| 126 |
else:
|
| 127 |
return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight
|
| 128 |
|
| 129 |
+
def tl_distance(self, other: PolygonBox):
|
| 130 |
+
return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5
|
| 131 |
+
|
| 132 |
def rescale(self, old_size, new_size):
|
| 133 |
# Point is in x, y format
|
| 134 |
page_width, page_height = old_size
|
marker/util.py
CHANGED
|
@@ -80,3 +80,22 @@ def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]
|
|
| 80 |
height = np.maximum(0, max_y - min_y)
|
| 81 |
|
| 82 |
return width * height # Shape: (N, M)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
height = np.maximum(0, max_y - min_y)
|
| 81 |
|
| 82 |
return width * height # Shape: (N, M)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
|
| 86 |
+
if len(boxes2) == 0:
|
| 87 |
+
return np.zeros((len(boxes1), 0))
|
| 88 |
+
if len(boxes1) == 0:
|
| 89 |
+
return np.zeros((0, len(boxes2)))
|
| 90 |
+
|
| 91 |
+
boxes1 = np.array(boxes1) # Shape: (N, 4)
|
| 92 |
+
boxes2 = np.array(boxes2) # Shape: (M, 4)
|
| 93 |
+
|
| 94 |
+
boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2 # Shape: (N, 2)
|
| 95 |
+
boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2 # Shape: (M, 2)
|
| 96 |
+
|
| 97 |
+
boxes1_centers = boxes1_centers[:, np.newaxis, :] # Shape: (N, 1, 2)
|
| 98 |
+
boxes2_centers = boxes2_centers[np.newaxis, :, :] # Shape: (1, M, 2)
|
| 99 |
+
|
| 100 |
+
distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2) # Shape: (N, M)
|
| 101 |
+
return distances
|