Vik Paruchuri committed on
Commit
ff24a58
·
1 Parent(s): 6c81421

Add order processor

Browse files
benchmarks/overall/inference.py CHANGED
@@ -23,9 +23,9 @@ def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs
23
  pdf_bytes = sample["pdf"] # This is a single page PDF
24
  start = time.time()
25
  marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
26
- marker_md = clean_input(marker_md)
27
  total = time.time() - start
28
- scores = score_blocks(gt_markdown, marker_md)
29
  scores["time"] = total
30
  scores["markdown"] = marker_md
31
  return scores
@@ -41,8 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwa
41
  if not data:
42
  raise ValueError(f"Could not find data for uuid {uuid}")
43
 
44
- mathpix_md = clean_input(data["md"])
45
- scores = score_blocks(gt_markdown, mathpix_md)
46
  scores["time"] = data["time"]
47
- scores["markdown"] = mathpix_md
48
  return scores
 
23
  pdf_bytes = sample["pdf"] # This is a single page PDF
24
  start = time.time()
25
  marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
26
+ marker_md_clean = clean_input(marker_md)
27
  total = time.time() - start
28
+ scores = score_blocks(gt_markdown, marker_md_clean)
29
  scores["time"] = total
30
  scores["markdown"] = marker_md
31
  return scores
 
41
  if not data:
42
  raise ValueError(f"Could not find data for uuid {uuid}")
43
 
44
+ mathpix_md_clean = clean_input(data["md"])
45
+ scores = score_blocks(gt_markdown, mathpix_md_clean)
46
  scores["time"] = data["time"]
47
+ scores["markdown"] = data["md"]
48
  return scores
benchmarks/overall/overall.py CHANGED
@@ -56,7 +56,7 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
56
  "averages_by_type": averages_by_type,
57
  "averages_by_block_type": averages_by_block_type,
58
  "average_time": avg_time,
59
- "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
60
  }
61
 
62
  def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
 
56
  "averages_by_type": averages_by_type,
57
  "averages_by_block_type": averages_by_block_type,
58
  "average_time": avg_time,
59
+ "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
60
  }
61
 
62
  def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
benchmarks/overall/render.py CHANGED
@@ -12,6 +12,7 @@ import datasets
12
  import markdown2
13
  from playwright.sync_api import sync_playwright
14
 
 
15
  from benchmarks.overall.schema import FullResult
16
 
17
  def convert_to_html(md: str):
@@ -90,7 +91,13 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da
90
 
91
  ds_rows = defaultdict(dict)
92
  for idx in full_idxs:
93
- row = ds[idx] # img, gt_blocks, classification, language, uuid
 
 
 
 
 
 
94
  for method in all_scores:
95
  method_row = all_scores[method]["raw_scores"][idx]
96
  ds_rows[idx].update({
@@ -99,10 +106,11 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da
99
  f"{method}_image": markdown_to_image(method_row["markdown"]),
100
  f"{method}_time": method_row["time"]
101
  })
102
- gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])])
 
103
  ds_rows[idx].update({
104
  "gt_markdown": gt_md,
105
- "gt_image": markdown_to_image(gt_md)
106
  })
107
  out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
108
  return out_dataset
 
12
  import markdown2
13
  from playwright.sync_api import sync_playwright
14
 
15
+ from benchmarks.overall.clean import convert_to_md, clean_input
16
  from benchmarks.overall.schema import FullResult
17
 
18
  def convert_to_html(md: str):
 
91
 
92
  ds_rows = defaultdict(dict)
93
  for idx in full_idxs:
94
+ row = ds[idx]
95
+ ds_rows[idx].update({
96
+ "img": row["img"],
97
+ "classification": row["classification"],
98
+ "language": row["language"],
99
+ "uuid": row["uuid"]
100
+ })
101
  for method in all_scores:
102
  method_row = all_scores[method]["raw_scores"][idx]
103
  ds_rows[idx].update({
 
106
  f"{method}_image": markdown_to_image(method_row["markdown"]),
107
  f"{method}_time": method_row["time"]
108
  })
109
+ gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
110
+ gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
111
  ds_rows[idx].update({
112
  "gt_markdown": gt_md,
113
+ "gt_markdown_image": markdown_to_image(gt_md)
114
  })
115
  out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
116
  return out_dataset
benchmarks/overall/schema.py CHANGED
@@ -15,3 +15,4 @@ class FullResult(TypedDict):
15
  averages_by_block_type: Dict[str, List[float]]
16
  average_time: float
17
  average_score: float
 
 
15
  averages_by_block_type: Dict[str, List[float]]
16
  average_time: float
17
  average_score: float
18
+ gt_markdown: List[str]
marker/converters/pdf.py CHANGED
@@ -41,6 +41,7 @@ from marker.schema.blocks import Block
41
  from marker.schema.registry import register_block_class
42
  from marker.util import strings_to_classes
43
  from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
 
44
 
45
 
46
  class PdfConverter(BaseConverter):
@@ -59,6 +60,7 @@ class PdfConverter(BaseConverter):
59
  "Enable higher quality processing with LLMs.",
60
  ] = False
61
  default_processors: Tuple[BaseProcessor, ...] = (
 
62
  BlockquoteProcessor,
63
  CodeProcessor,
64
  DocumentTOCProcessor,
 
41
  from marker.schema.registry import register_block_class
42
  from marker.util import strings_to_classes
43
  from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
44
+ from marker.processors.order import OrderProcessor
45
 
46
 
47
  class PdfConverter(BaseConverter):
 
60
  "Enable higher quality processing with LLMs.",
61
  ] = False
62
  default_processors: Tuple[BaseProcessor, ...] = (
63
+ OrderProcessor,
64
  BlockquoteProcessor,
65
  CodeProcessor,
66
  DocumentTOCProcessor,
marker/processors/order.py CHANGED
@@ -1,4 +1,5 @@
1
  from statistics import mean
 
2
 
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
@@ -13,41 +14,53 @@ class OrderProcessor(BaseProcessor):
13
 
14
  def __call__(self, document: Document):
15
  for page in document.pages:
 
16
  if page.text_extraction_method != "pdftext":
17
  continue
18
 
 
19
  if not page.layout_sliced:
20
  continue
21
 
22
- block_idxs = {}
23
  for block_id in page.structure:
24
  block = document.get_block(block_id)
25
  spans = block.contained_blocks(document, (BlockTypes.Span, ))
26
  if len(spans) == 0:
27
  continue
28
 
 
29
  block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
30
 
31
  for block_id in page.structure:
32
- if block_id in block_idxs and block_idxs[block_id] > 0:
 
33
  continue
 
34
  block = document.get_block(block_id)
35
  prev_block = document.get_prev_block(block)
36
  next_block = document.get_next_block(block)
37
 
 
 
 
 
38
  while prev_block and prev_block.id not in block_idxs:
39
  prev_block = document.get_prev_block(prev_block)
 
40
 
41
  if not prev_block:
 
42
  while next_block and next_block.id not in block_idxs:
43
  next_block = document.get_next_block(next_block)
 
44
 
45
  if not next_block and not prev_block:
46
- block_idxs[block_id] = 0
47
  elif prev_block:
48
- block_idxs[block_id] = block_idxs[prev_block.id] + 1
49
  else:
50
- block_idxs[block_id] = block_idxs[next_block.id] - 1
51
 
52
  page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
53
 
 
1
  from statistics import mean
2
+ from collections import defaultdict
3
 
4
  from marker.processors import BaseProcessor
5
  from marker.schema import BlockTypes
 
14
 
15
  def __call__(self, document: Document):
16
  for page in document.pages:
17
+ # Skip OCRed pages
18
  if page.text_extraction_method != "pdftext":
19
  continue
20
 
21
+ # Skip pages without layout slicing
22
  if not page.layout_sliced:
23
  continue
24
 
25
+ block_idxs = defaultdict(int)
26
  for block_id in page.structure:
27
  block = document.get_block(block_id)
28
  spans = block.contained_blocks(document, (BlockTypes.Span, ))
29
  if len(spans) == 0:
30
  continue
31
 
32
+ # Avg span position in original PDF
33
  block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
34
 
35
  for block_id in page.structure:
36
+ # Already assigned block id via span position
37
+ if block_idxs[block_id] > 0:
38
  continue
39
+
40
  block = document.get_block(block_id)
41
  prev_block = document.get_prev_block(block)
42
  next_block = document.get_next_block(block)
43
 
44
+ block_idx_add = 0
45
+ if prev_block:
46
+ block_idx_add = 1
47
+
48
  while prev_block and prev_block.id not in block_idxs:
49
  prev_block = document.get_prev_block(prev_block)
50
+ block_idx_add += 1
51
 
52
  if not prev_block:
53
+ block_idx_add = -1
54
  while next_block and next_block.id not in block_idxs:
55
  next_block = document.get_next_block(next_block)
56
+ block_idx_add -= 1
57
 
58
  if not next_block and not prev_block:
59
+ pass
60
  elif prev_block:
61
+ block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
62
  else:
63
+ block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add
64
 
65
  page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
66
 
marker/schema/polygon.py CHANGED
@@ -126,6 +126,9 @@ class PolygonBox(BaseModel):
126
  else:
127
  return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight
128
 
 
 
 
129
  def rescale(self, old_size, new_size):
130
  # Point is in x, y format
131
  page_width, page_height = old_size
 
126
  else:
127
  return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight
128
 
129
+ def tl_distance(self, other: PolygonBox):
130
+ return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5
131
+
132
  def rescale(self, old_size, new_size):
133
  # Point is in x, y format
134
  page_width, page_height = old_size
marker/util.py CHANGED
@@ -80,3 +80,22 @@ def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]
80
  height = np.maximum(0, max_y - min_y)
81
 
82
  return width * height # Shape: (N, M)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  height = np.maximum(0, max_y - min_y)
81
 
82
  return width * height # Shape: (N, M)
83
def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    """Compute pairwise Euclidean distances between the centers of two sets of boxes.

    Args:
        boxes1: N boxes as [x1, y1, x2, y2].
        boxes2: M boxes as [x1, y1, x2, y2].

    Returns:
        np.ndarray of shape (N, M) where entry (i, j) is the distance between
        the center of boxes1[i] and the center of boxes2[j].
    """
    # Preserve the (N, M) shape contract even when either side is empty.
    if len(boxes2) == 0:
        return np.zeros((len(boxes1), 0))
    if len(boxes1) == 0:
        return np.zeros((0, len(boxes2)))

    boxes1 = np.array(boxes1)  # Shape: (N, 4)
    boxes2 = np.array(boxes2)  # Shape: (M, 4)

    # Center = midpoint of (x1, y1) and (x2, y2).
    boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2  # Shape: (N, 2)
    boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2  # Shape: (M, 2)

    # Insert singleton axes so broadcasting yields all pairwise differences.
    boxes1_centers = boxes1_centers[:, np.newaxis, :]  # Shape: (N, 1, 2)
    boxes2_centers = boxes2_centers[np.newaxis, :, :]  # Shape: (1, M, 2)

    distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2)  # Shape: (N, M)
    return distances