Vik Paruchuri committed
Commit a19295b · 1 Parent(s): 2bcf51e

Clean up benchmark, make more pluggable

benchmarks/__init__.py ADDED
File without changes
benchmarks/overall/display/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ from pathlib import Path
2
+ from typing import Dict, List
3
+
4
+ import tabulate
5
+
6
+ from benchmarks.overall.schema import FullResult
7
+
8
+ def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
9
+ table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
10
+ with open(out_path / filename, "w", encoding="utf-8") as f:
11
+ f.write(f"# {title}\n")
12
+ f.write(table)
13
+ print(title)
14
+ print(table)
15
+
16
+
17
+ def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
18
+ document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
19
+ headers = ["Document Type"]
20
+ for method in methods:
21
+ for score_type in score_types:
22
+ headers.append(f"{method} {score_type}")
23
+
24
+ document_rows = [[k] for k in document_types]
25
+ for i, doc_type in enumerate(document_types):
26
+ for method in methods:
27
+ for score_type in score_types:
28
+ avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
29
+ document_rows[i].append(avg_score)
30
+
31
+ write_table("Document Types", document_rows, headers, out_path, "document_types.md")
32
+
33
+ headers = ["Block Type"]
34
+ block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
35
+ block_score_types = list(result["averages_by_block_type"][default_method].keys())
36
+ for method in methods:
37
+ for score_type in block_score_types:
38
+ headers.append(f"{method} {score_type}")
39
+
40
+ block_rows = [[k] for k in block_types]
41
+ for i, block_type in enumerate(block_types):
42
+ for method in methods:
43
+ for score_type in block_score_types:
44
+ avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
45
+ block_rows[i].append(avg_score)
46
+
47
+ write_table("Block types", block_rows, headers, out_path, "block_types.md")
48
+
49
+ headers = ["Method", "Avg Time"] + score_types
50
+ inference_rows = [[k] for k in methods]
51
+ all_raw_scores = [result["scores"][i] for i in result["scores"]]
52
+ for i, method in enumerate(methods):
53
+ avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
54
+ inference_rows[i].append(avg_time)
55
+ for score_type in score_types:
56
+ scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
57
+ avg_score = sum(scores_lst) / max(1, len(scores_lst))
58
+ inference_rows[i].append(avg_score)
59
+
60
+ write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
61
+
62
+ print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
benchmarks/overall/inference.py DELETED
@@ -1,48 +0,0 @@
1
- import tempfile
2
- import time
3
-
4
- from benchmarks.overall.clean import clean_input
5
- from benchmarks.overall.schema import BlockScores
6
- from benchmarks.overall.scoring import score_blocks
7
- from marker.converters.pdf import PdfConverter
8
-
9
- def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
10
- block_converter = PdfConverter(
11
- artifact_dict=marker_models,
12
- config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
13
- )
14
-
15
- with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
16
- f.write(pdf_bytes)
17
- rendered = block_converter(f.name)
18
-
19
- return rendered.markdown
20
-
21
-
22
- def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
23
- pdf_bytes = sample["pdf"] # This is a single page PDF
24
- start = time.time()
25
- marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
26
- marker_md_clean = clean_input(marker_md)
27
- total = time.time() - start
28
- scores = score_blocks(gt_markdown, marker_md_clean)
29
- scores["time"] = total
30
- scores["markdown"] = marker_md
31
- return scores
32
-
33
-
34
- def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
35
- uuid = sample["uuid"]
36
- data = None
37
- for row in mathpix_ds:
38
- if str(row["uuid"]) == str(uuid):
39
- data = row
40
- break
41
- if not data:
42
- raise ValueError(f"Could not find data for uuid {uuid}")
43
-
44
- mathpix_md_clean = clean_input(data["md"])
45
- scores = score_blocks(gt_markdown, mathpix_md_clean)
46
- scores["time"] = data["time"]
47
- scores["markdown"] = data["md"]
48
- return scores
benchmarks/overall/methods/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ import io
2
+ import re
3
+ from typing import Tuple
4
+
5
+ import markdown2
6
+ from PIL import Image
7
+ from playwright.sync_api import sync_playwright
8
+
9
+ from benchmarks.overall.methods.schema import BenchmarkResult
10
+ from marker.renderers.markdown import MarkdownRenderer
11
+
12
+
13
+ class BaseMethod:
14
+ def __init__(self, **kwargs):
15
+ for kwarg in kwargs:
16
+ if hasattr(self, kwarg):
17
+ setattr(self, kwarg, kwargs[kwarg])
18
+
19
+ @staticmethod
20
+ def convert_to_md(html: str):
21
+ md = MarkdownRenderer()
22
+ markdown = md.md_cls.convert(html)
23
+ return markdown
24
+
25
+ def __call__(self, sample) -> BenchmarkResult:
26
+ raise NotImplementedError()
27
+
28
+ def render(self, markdown: str):
29
+ return self.html_to_image(self.convert_to_html(markdown))
30
+
31
+ @staticmethod
32
+ def convert_to_html(md: str):
33
+ block_placeholders = []
34
+ inline_placeholders = []
35
+
36
+ # Add placeholders for the math
37
+ def block_sub(match):
38
+ content = match.group(1)
39
+ placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
40
+ block_placeholders.append((placeholder, f"$${content}$$"))
41
+ return placeholder
42
+
43
+ def inline_sub(match):
44
+ content = match.group(1)
45
+ placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
46
+ inline_placeholders.append((placeholder, f"${content}$"))
47
+ return placeholder
48
+
49
+ md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
50
+ md = re.sub(r'\$(.*?)\$', inline_sub, md)
51
+
52
+ html = markdown2.markdown(md, extras=['tables'])
53
+
54
+ # Replace placeholders
55
+ for placeholder, math_str in block_placeholders:
56
+ html = html.replace(placeholder, math_str)
57
+ for placeholder, math_str in inline_placeholders:
58
+ html = html.replace(placeholder, math_str)
59
+
60
+ return html
61
+
62
+ def html_to_image(self, html: str) -> Image.Image:
63
+ with sync_playwright() as p:
64
+ browser = p.chromium.launch()
65
+ page = browser.new_page()
66
+ page.set_content(f"""
67
+ <head>
68
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
69
+ <!-- The loading of KaTeX is deferred to speed up page rendering -->
70
+ <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
71
+ <!-- To automatically render math in text elements, include the auto-render extension: -->
72
+ <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
73
+ </head>
74
+ <body>
75
+ {html}
76
+ <script>
77
+ renderMathInElement(document.body, {{
78
+ delimiters: [
79
+ {{left: '$$', right: '$$', display: true}},
80
+ {{left: '$', right: '$', display: false}}
81
+ ]
82
+ }});
83
+ </script>
84
+ </body>
85
+ """)
86
+ page.set_viewport_size({"width": 1200, "height": 800})
87
+ page.wait_for_timeout(500) # Wait for KaTeX to render
88
+ screenshot_bytes = page.screenshot(full_page=True)
89
+ browser.close()
90
+
91
+ return Image.open(io.BytesIO(screenshot_bytes))
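The placeholder substitution in `convert_to_html` exists because `markdown2` would otherwise mangle `$...$` and `$$...$$` spans; the math is swapped out before conversion and restored afterwards so KaTeX's auto-render can pick it up in the screenshot. A quick illustration, assuming the module's dependencies (markdown2, playwright, marker) are installed:

```python
from benchmarks.overall.methods import BaseMethod

md = "Inline math $x^2 + 1$ and a block:\n\n$$\\int_0^1 x\\,dx$$"
html = BaseMethod.convert_to_html(md)
print(html)  # the $...$ and $$...$$ spans survive markdown2 untouched
```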
benchmarks/overall/methods/gt.py ADDED
@@ -0,0 +1,29 @@
1
+ from typing import List
2
+ import json
3
+
4
+ from PIL import Image
5
+
6
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
7
+
8
+
9
+ class GTMethod(BaseMethod):
10
+ def __call__(self, sample) -> BenchmarkResult:
11
+ gt_blocks = json.loads(sample["gt_blocks"])
12
+ gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
13
+ gt_markdown = [self.convert_to_md(block) for block in gt_html]
14
+ return {
15
+ "markdown": gt_markdown,
16
+ "time": 0
17
+ }
18
+
19
+ def render(self, html: List[str]) -> Image.Image:
20
+ joined = "\n\n".join(html)
21
+ html = f"""
22
+ <html>
23
+ <head></head>
24
+ <body>
25
+ {joined}
26
+ </body>
27
+ </html>
28
+ """.strip()
29
+ return self.html_to_image(html)
benchmarks/overall/methods/marker.py ADDED
@@ -0,0 +1,29 @@
1
+ import tempfile
2
+ import time
3
+
4
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
5
+ from marker.converters.pdf import PdfConverter
6
+
7
+
8
+ class MarkerMethod(BaseMethod):
9
+ model_dict: dict = None
10
+ use_llm: bool = False
11
+
12
+ def __call__(self, sample) -> BenchmarkResult:
13
+ pdf_bytes = sample["pdf"] # This is a single page PDF
14
+ block_converter = PdfConverter(
15
+ artifact_dict=self.model_dict,
16
+ config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm}
17
+ )
18
+
19
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
20
+ f.write(pdf_bytes)
21
+ start = time.time()
22
+ rendered = block_converter(f.name)
23
+ total = time.time() - start
24
+
25
+ return {
26
+ "markdown": rendered.markdown,
27
+ "time": total
28
+ }
29
+
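A minimal sketch of invoking the marker method directly on one benchmark row, mirroring what the refactored scoring loop does per sample (the first dataset row is just an illustrative choice):

```python
import datasets

from benchmarks.overall.methods.marker import MarkerMethod
from marker.models import create_model_dict

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
method = MarkerMethod(model_dict=create_model_dict(), use_llm=False)

result = method(ds[0])  # BenchmarkResult: {"markdown": ..., "time": ...}
print(result["time"], result["markdown"][:200])
```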
benchmarks/overall/methods/mathpix.py ADDED
@@ -0,0 +1,22 @@
1
+ import datasets
2
+
3
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
4
+
5
+
6
+ class MathpixMethod(BaseMethod):
7
+ mathpix_ds: datasets.Dataset = None
8
+
9
+ def __call__(self, sample) -> BenchmarkResult:
10
+ uuid = sample["uuid"]
11
+ data = None
12
+ for row in self.mathpix_ds:
13
+ if str(row["uuid"]) == str(uuid):
14
+ data = row
15
+ break
16
+ if not data:
17
+ raise ValueError(f"Could not find data for uuid {uuid}")
18
+
19
+ return {
20
+ "markdown": data["md"],
21
+ "time": data["time"]
22
+ }
benchmarks/overall/methods/schema.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import TypedDict, List
2
+
3
+
4
+ class BenchmarkResult(TypedDict):
5
+ markdown: str | List[str]
6
+ time: float | None
benchmarks/overall/overall.py CHANGED
@@ -2,117 +2,86 @@ import json
2
  import os
3
  from collections import defaultdict
4
  from pathlib import Path
5
- from typing import Dict
6
 
7
  import click
8
  import datasets
9
- import tabulate
10
- from benchmarks.overall.render import build_dataset
11
  from tqdm import tqdm
12
- import pypdfium2 as pdfium
13
 
14
- from benchmarks.overall.clean import convert_to_md, clean_input
15
- from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
16
  from benchmarks.overall.schema import FullResult
17
  from marker.logger import configure_logging
18
  from marker.models import create_model_dict
19
  from marker.settings import settings
 
20
 
21
  configure_logging()
22
 
23
 
24
- def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult:
25
  bench_scores = {}
26
- averages_by_type = defaultdict(list)
27
- averages_by_block_type = defaultdict(list)
28
- for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
 
 
29
  if max_rows is not None and idx >= max_rows:
30
  break
31
 
32
- gt_blocks = json.loads(sample["gt_blocks"])
33
  doc_type = sample["classification"]
34
 
35
- try:
36
- gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
37
- gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
38
- scores = score_func(model_dict, sample, gt_markdown, **kwargs)
39
- except ValueError as e:
40
- print(f"Error with sample {idx}: {e}")
41
- continue
42
- except pdfium.PdfiumError as e:
43
- print(f"Error opening pdf: {e}")
44
- continue
45
 
46
- averages_by_type[doc_type].append(scores["overall_score"])
47
 
48
- for score, gt_block in zip(scores["scores"], gt_blocks):
49
- averages_by_block_type[gt_block["block_type"]].append(score)
 
50
 
51
- bench_scores[idx] = scores
52
 
53
- avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
54
  return {
55
- "raw_scores": bench_scores,
 
56
  "averages_by_type": averages_by_type,
57
  "averages_by_block_type": averages_by_block_type,
58
- "average_time": avg_time,
59
- "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
60
  }
61
 
62
- def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
63
- inference_types = [default_method] + [k for k in scores.keys() if k != default_method]
64
-
65
- document_types = list(scores[default_method]["averages_by_type"].keys())
66
- document_rows = [[k] for k in document_types]
67
- for k in inference_types:
68
- for i, doc_type in enumerate(document_types):
69
- avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
70
- document_rows[i].append(avg)
71
-
72
- print("Document types")
73
- document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
74
- print(document_type_table)
75
- with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
76
- f.write(document_type_table)
77
-
78
- block_types = list(scores[default_method]["averages_by_block_type"].keys())
79
- block_rows = [[k] for k in block_types]
80
- for k in inference_types:
81
- for i, block_type in enumerate(block_types):
82
- avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
83
- block_rows[i].append(avg)
84
-
85
- print("Block types")
86
- block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
87
- print(block_type_table)
88
- with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
89
- f.write(block_type_table)
90
-
91
- headers = ["Method", "Avg Score", "Avg Time"]
92
- inference_rows = [[k] for k in inference_types]
93
- for i, k in enumerate(inference_types):
94
- inference_rows[i].append(scores[k]["average_score"])
95
- inference_rows[i].append(scores[k]["average_time"])
96
-
97
- print("Overall")
98
- overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
99
- print(overall_table)
100
- with open(out_path / "overall.md", "w", encoding="utf-8") as f:
101
- f.write(overall_table)
102
-
103
- print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
104
-
105
  @click.command(help="Benchmark PDF to MD conversion.")
106
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
107
  @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
108
- @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
 
109
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
110
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
111
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
112
  def main(
113
  dataset: str,
114
  out_dataset: str,
115
- other_methods: str,
 
116
  result_path: str,
117
  max_rows: int,
118
  use_llm: bool
@@ -120,37 +89,35 @@ def main(
120
  out_path = Path(result_path)
121
  out_path.mkdir(parents=True, exist_ok=True)
122
 
123
- allowed_methods = ["mathpix", ""]
124
- methods = other_methods.split(",")
125
  for method in methods:
126
- if method not in allowed_methods:
127
- raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")
128
-
129
- model_dict = create_model_dict()
130
- ds = datasets.load_dataset(dataset, split="train")
131
-
132
- marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm)
133
- all_scores = {
134
- "marker": marker_scores
  }
136
 
137
- if "mathpix" in methods:
138
- mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
139
- mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
140
- all_scores["mathpix"] = mathpix_scores
141
-
142
- # Display formatted score tables
143
- print_scores(all_scores, out_path)
144
 
145
- with open(out_path / "overall.json", "w", encoding="utf-8") as f:
146
- json.dump(all_scores, f, indent=2, ensure_ascii=False)
147
 
148
- print(f"Results saved to {out_path}.")
 
149
 
150
- # Push up comparison dataset
151
- if out_dataset is not None:
152
- out_ds = build_dataset(ds, all_scores)
153
- out_ds.push_to_hub(out_dataset)
154
 
155
  if __name__ == "__main__":
156
  main()
 
2
  import os
3
  from collections import defaultdict
4
  from pathlib import Path
5
+ from typing import List
6
 
7
  import click
8
  import datasets
 
 
9
  from tqdm import tqdm
 
10
 
11
+ from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 
12
  from benchmarks.overall.schema import FullResult
13
  from marker.logger import configure_logging
14
  from marker.models import create_model_dict
15
  from marker.settings import settings
16
+ from benchmarks.overall.display import print_scores
17
 
18
  configure_logging()
19
 
20
 
21
+ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
22
  bench_scores = {}
23
+ averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
24
+ averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
25
+ average_times = defaultdict(list)
26
+ markdown_by_method = defaultdict(dict)
27
+ for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark"):
28
  if max_rows is not None and idx >= max_rows:
29
  break
30
 
 
31
  doc_type = sample["classification"]
32
+ gt_cls = METHOD_REGISTRY["gt"]
33
+ gt_blocks = json.loads(sample["gt_blocks"])
34
+ gt_md = gt_cls(**artifacts)(sample)["markdown"]
35
+
36
+ out_data = defaultdict(dict)
37
+
38
+ for method in methods:
39
+ method_cls = METHOD_REGISTRY[method](**artifacts)
40
+ method_info = method_cls(sample)
41
+ method_md = method_info["markdown"]
42
+ average_times[method].append(method_info["time"])
43
+ markdown_by_method[idx][method] = method_md
44
+
45
+ for score_type in score_types:
46
+ score_cls = SCORE_REGISTRY[score_type]()
47
+ try:
48
+ scores = score_cls(sample, gt_md, method_md)
49
+ except Exception as e:
50
+ # Some scorers can fail, like the LLM one
51
+ print(f"Failed to score {method} with {score_type}: {e}")
52
+ continue
53
 
54
+ out_data[method][score_type] = scores
55
 
56
+ averages_by_type[method][score_type][doc_type].append(scores["score"])
57
 
58
+ if "by_block" in scores["specific_scores"]: # Not all scorers support this
59
+ for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
60
+ averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
61
 
62
+ bench_scores[idx] = out_data
63
 
 
64
  return {
65
+ "scores": bench_scores,
66
+ "markdown": markdown_by_method,
67
  "averages_by_type": averages_by_type,
68
  "averages_by_block_type": averages_by_block_type,
69
+ "average_times": average_times,
 
70
  }
71
 
72
  @click.command(help="Benchmark PDF to MD conversion.")
73
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
74
  @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
75
+ @click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix", default="marker")
76
+ @click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic")
77
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
78
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
79
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
80
  def main(
81
  dataset: str,
82
  out_dataset: str,
83
+ methods: str,
84
+ scores: str,
85
  result_path: str,
86
  max_rows: int,
87
  use_llm: bool
 
89
  out_path = Path(result_path)
90
  out_path.mkdir(parents=True, exist_ok=True)
91
 
92
+ methods = methods.split(",")
 
93
  for method in methods:
94
+ if method not in METHOD_REGISTRY:
95
+ raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}")
96
+
97
+ # Ensure marker is always first
98
+ methods = list(set(methods))
99
+ methods = ["marker"] + [m for m in methods if m != "marker"]
100
+
101
+ score_types = scores.split(",")
102
+ for score_type in score_types:
103
+ if score_type not in SCORE_REGISTRY:
104
+ raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}")
105
+
106
+ benchmark_dataset = datasets.load_dataset(dataset, split="train")
107
+ artifacts = {
108
+ "model_dict": create_model_dict(),
109
+ "mathpix_ds": datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train"),
110
+ "use_llm": use_llm
111
  }
112
 
113
+ result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)
114
 
115
+ # Display benchmark scoring tables
116
+ print_scores(result, out_path, methods, score_types)
117
 
118
+ with open(out_path / "result.json", "w") as f:
119
+ json.dump(result, f)
120
 
 
 
 
 
121
 
122
  if __name__ == "__main__":
123
  main()
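With the refactor, `main` only resolves names against the registries, so comparing methods and scorers comes down to flags like `--methods marker,mathpix --scores heuristic,llm`. A rough sketch of the same flow driven programmatically (argument values are illustrative; `mathpix_ds` is only needed when mathpix is among the methods):

```python
import datasets

from benchmarks.overall.overall import get_method_scores
from marker.models import create_model_dict

benchmark_dataset = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
artifacts = {
    "model_dict": create_model_dict(),
    "mathpix_ds": None,   # load datalab-to/marker_benchmark_mathpix when comparing mathpix
    "use_llm": False,
}

result = get_method_scores(benchmark_dataset, ["marker"], ["heuristic"], artifacts, max_rows=5)
print(result["averages_by_type"]["marker"]["heuristic"].keys())  # document types seen
```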
benchmarks/overall/registry.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from benchmarks.overall.methods.gt import GTMethod
2
+ from benchmarks.overall.methods.marker import MarkerMethod
3
+ from benchmarks.overall.methods.mathpix import MathpixMethod
4
+ from benchmarks.overall.scorers.heuristic import HeuristicScorer
5
+ from benchmarks.overall.scorers.llm import LLMScorer
6
+
7
+ SCORE_REGISTRY = {
8
+ "heuristic": HeuristicScorer,
9
+ "llm": LLMScorer
10
+ }
11
+
12
+ METHOD_REGISTRY = {
13
+ "marker": MarkerMethod,
14
+ "gt": GTMethod,
15
+ "mathpix": MathpixMethod
16
+ }
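The registries are the pluggability point: a new conversion method or scorer is one class plus one dictionary entry, after which the CLI accepts it by name. A hedged sketch with a hypothetical `echo` method (not part of this commit):

```python
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from benchmarks.overall.registry import METHOD_REGISTRY


class EchoMethod(BaseMethod):
    # Hypothetical toy method: returns fixed markdown instantly.
    def __call__(self, sample) -> BenchmarkResult:
        return {"markdown": "# placeholder output", "time": 0.0}


METHOD_REGISTRY["echo"] = EchoMethod  # `--methods marker,echo` would then resolve
print(sorted(METHOD_REGISTRY))        # ['echo', 'gt', 'marker', 'mathpix']
```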
benchmarks/overall/render.py DELETED
@@ -1,117 +0,0 @@
1
- import subprocess
2
- import tempfile
3
- import pypdfium2 as pdfium
4
- from typing import Dict
5
- from collections import defaultdict
6
- import re
7
- import io
8
- import json
9
-
10
- from PIL import Image
11
- import datasets
12
- import markdown2
13
- from playwright.sync_api import sync_playwright
14
-
15
- from benchmarks.overall.clean import convert_to_md, clean_input
16
- from benchmarks.overall.schema import FullResult
17
-
18
- def convert_to_html(md: str):
19
- block_placeholders = []
20
- inline_placeholders = []
21
-
22
- # Add placeholders for the math
23
- def block_sub(match):
24
- content = match.group(1)
25
- placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
26
- block_placeholders.append((placeholder, f"$${content}$$"))
27
- return placeholder
28
-
29
- def inline_sub(match):
30
- content = match.group(1)
31
- placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
32
- inline_placeholders.append((placeholder, f"${content}$"))
33
- return placeholder
34
-
35
- md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
36
- md = re.sub(r'\$(.*?)\$', inline_sub, md)
37
-
38
- html = markdown2.markdown(md, extras=['tables'])
39
-
40
- # Replace placeholders
41
- for placeholder, math_str in block_placeholders:
42
- html = html.replace(placeholder, math_str)
43
- for placeholder, math_str in inline_placeholders:
44
- html = html.replace(placeholder, math_str)
45
-
46
- return html
47
-
48
-
49
- def markdown_to_image(md: str) -> Image.Image:
50
- html = convert_to_html(md)
51
- with sync_playwright() as p:
52
- browser = p.chromium.launch()
53
- page = browser.new_page()
54
- page.set_content(f"""
55
- <head>
56
- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
57
- <!-- The loading of KaTeX is deferred to speed up page rendering -->
58
- <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
59
- <!-- To automatically render math in text elements, include the auto-render extension: -->
60
- <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
61
- </head>
62
- <body>
63
- {html}
64
- <script>
65
- renderMathInElement(document.body, {{
66
- delimiters: [
67
- {{left: '$$', right: '$$', display: true}},
68
- {{left: '$', right: '$', display: false}}
69
- ]
70
- }});
71
- </script>
72
- </body>
73
- """)
74
- page.set_viewport_size({"width": 1200, "height": 800})
75
- page.wait_for_timeout(500) # Wait for KaTeX to render
76
- screenshot_bytes = page.screenshot(full_page=True)
77
- browser.close()
78
-
79
- return Image.open(io.BytesIO(screenshot_bytes))
80
-
81
-
82
- def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
83
- # Get all the dataset indices that went through inference
84
- full_idxs = None
85
- for method in all_scores:
86
- result_idxs = list(all_scores[method]["raw_scores"].keys())
87
- if full_idxs is None:
88
- full_idxs = sorted(result_idxs)
89
- else:
90
- full_idxs = [f for f in full_idxs if f in result_idxs]
91
-
92
- ds_rows = defaultdict(dict)
93
- for idx in full_idxs:
94
- row = ds[idx]
95
- ds_rows[idx].update({
96
- "img": row["img"],
97
- "classification": row["classification"],
98
- "language": row["language"],
99
- "uuid": row["uuid"]
100
- })
101
- for method in all_scores:
102
- method_row = all_scores[method]["raw_scores"][idx]
103
- ds_rows[idx].update({
104
- f"{method}_score": method_row["overall_score"],
105
- f"{method}_markdown": method_row["markdown"],
106
- f"{method}_image": markdown_to_image(method_row["markdown"]),
107
- f"{method}_time": method_row["time"]
108
- })
109
- gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
110
- gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
111
- ds_rows[idx].update({
112
- "gt_markdown": gt_md,
113
- "gt_markdown_image": markdown_to_image(gt_md)
114
- })
115
- out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
116
- return out_dataset
117
-
benchmarks/overall/schema.py CHANGED
@@ -1,18 +1,12 @@
1
- from typing import TypedDict, List, Dict, Optional
2
 
 
3
 
4
- class BlockScores(TypedDict):
5
- scores: List[float]
6
- order_score: float
7
- overall_score: float
8
- time: Optional[float]
9
- markdown: str
10
-
11
 
12
  class FullResult(TypedDict):
13
- raw_scores: Dict[int, BlockScores]
14
- averages_by_type: Dict[str, List[float]]
15
- averages_by_block_type: Dict[str, List[float]]
16
- average_time: float
17
- average_score: float
18
- gt_markdown: List[str]
 
1
+ from typing import TypedDict, List, Dict
2
 
3
+ from benchmarks.overall.scorers.schema import BlockScores
4
 
5
+ AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]
6
 
7
  class FullResult(TypedDict):
8
+ scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
9
+ averages_by_type: AVG_TYPE
10
+ averages_by_block_type: AVG_TYPE
11
+ average_times: Dict[str, List[float]]
12
+ markdown: Dict[int, Dict[str, str]]
 
benchmarks/overall/scorers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from typing import List
2
+
3
+ from benchmarks.overall.scorers.schema import BlockScores
4
+
5
+
6
+ class BaseScorer:
7
+ def __init__(self):
8
+ pass
9
+
10
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
11
+ raise NotImplementedError()
benchmarks/overall/{clean.py → scorers/clean.py} RENAMED
@@ -5,8 +5,6 @@ from pathlib import Path
5
 
6
  import latex2mathml.converter
7
 
8
- from marker.renderers.markdown import MarkdownRenderer
9
-
10
  class MarkdownCleaner:
11
  def __init__(self):
12
  pass
@@ -112,14 +110,4 @@ class MarkdownCleaner:
112
  return latex_str
113
 
114
 
115
- def convert_to_md(html):
116
- md = MarkdownRenderer()
117
- markdown = md.md_cls.convert(html)
118
- return markdown
119
-
120
- def clean_input(markdown):
121
- cleaner = MarkdownCleaner()
122
- return cleaner(markdown)
123
-
124
-
125
 
 
5
 
6
  import latex2mathml.converter
7
 
 
 
8
  class MarkdownCleaner:
9
  def __init__(self):
10
  pass
 
110
  return latex_str
111
 
112
 
113
 
benchmarks/overall/scorers/heuristic.py ADDED
@@ -0,0 +1,96 @@
1
+ from typing import List
2
+
3
+ from rapidfuzz import fuzz
4
+
5
+ from benchmarks.overall.scorers.clean import MarkdownCleaner
6
+ from benchmarks.overall.scorers.schema import BlockScores
7
+ from benchmarks.overall.scorers import BaseScorer
8
+
9
+
10
+ class HeuristicScorer(BaseScorer):
11
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
12
+ # Standardize inputs
13
+ gt_markdown = [self.clean_input(block) for block in gt_markdown]
14
+ method_markdown = self.clean_input(method_markdown)
15
+
16
+ alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
17
+ scores = [alignment["score"] for alignment in alignments]
18
+
19
+ # Find order score
20
+ orders = [alignment["start"] for alignment in alignments]
21
+ correct_order = list(range(len(gt_markdown)))
22
+ actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
23
+ order_score = self.kendall_tau(correct_order, actual_order)
24
+
25
+ # Weight score by sequence length
26
+ gt_weights = [len(g) for g in gt_markdown]
27
+ weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
28
+
29
+ # Weight the score by sequence length
30
+ overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
31
+ overall_score = overall_score * 0.8 + order_score * 0.2
32
+ return {
33
+ "score": overall_score,
34
+ "specific_scores": {
35
+ "order": order_score,
36
+ "by_block": scores
37
+ },
38
+ }
39
+
40
+ @staticmethod
41
+ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
42
+ n = len(correct_order)
43
+ concordant = 0
44
+ discordant = 0
45
+
46
+ if n <= 1:
47
+ return 100
48
+
49
+ for i in range(n):
50
+ for j in range(i + 1, n):
51
+ correct_sign = correct_order[i] - correct_order[j]
52
+ actual_sign = actual_order[i] - actual_order[j]
53
+
54
+ if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
55
+ concordant += 1
56
+ elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
57
+ discordant += 1
58
+
59
+ total_pairs = (n * (n - 1)) // 2
60
+ tau = (concordant - discordant) / total_pairs
61
+ tau = (tau + 1) / 2 # 0-1 scale
62
+ return tau * 100 # 0-100 scale
63
+
64
+ @staticmethod
65
+ def find_fuzzy_alignments(
66
+ main_string: str,
67
+ substrings: List[str],
68
+ threshold: int = 70
69
+ ) -> List[dict]:
70
+ alignments = []
71
+
72
+ for idx, substr in enumerate(substrings):
73
+ result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
74
+
75
+ score = 0
76
+ dest_start = 0
77
+ dest_end = 0
78
+ if result:
79
+ score = result.score
80
+ dest_start = result.dest_start
81
+ dest_end = result.dest_end
82
+
83
+ alignments.append({
84
+ "string": substr,
85
+ "start": dest_start,
86
+ "end": dest_end,
87
+ "score": score,
88
+ "idx": idx
89
+ })
90
+ return alignments
91
+
92
+
93
+ @staticmethod
94
+ def clean_input(md: str):
95
+ cleaner = MarkdownCleaner()
96
+ return cleaner(md)
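A small self-contained check of the heuristic scorer on toy blocks; scores are 0-100, weighted by block length, with a Kendall-tau term for ordering. Requires the module's dependencies (rapidfuzz, and latex2mathml for the cleaner):

```python
from benchmarks.overall.scorers.heuristic import HeuristicScorer

gt_blocks = ["# Introduction", "This paper studies markdown conversion quality."]
predicted = "# Introduction\n\nThis paper studies markdown conversion qualty."  # one typo

scores = HeuristicScorer()(None, gt_blocks, predicted)   # the sample argument is unused here
print(scores["score"])                                   # overall 0-100
print(scores["specific_scores"]["order"])                # ordering component
print(scores["specific_scores"]["by_block"])             # per-block alignment scores
```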
benchmarks/overall/scorers/llm.py ADDED
@@ -0,0 +1,148 @@
1
+ import json
2
+ import tempfile
3
+ import time
4
+ from typing import List
5
+
6
+ from PIL import Image
7
+ from google.ai.generativelanguage_v1beta.types import content
8
+ from google.api_core.exceptions import ResourceExhausted
9
+ import pypdfium2 as pdfium
10
+
11
+ from benchmarks.overall.scorers import BaseScorer, BlockScores
12
+ from marker.settings import settings
13
+
14
+ rating_prompt = """
15
+ You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
16
+ You're given an image, along with the extracted markdown:
17
+ - Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
18
+ - Tables will be formatted as Github flavored markdown.
19
+ - Block equations will be in LaTeX.
20
+ - The image and markdown may be in any language.
21
+ - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
22
+
23
+ The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
24
+
25
+ **Instructions**
26
+ Follow this process to evaluate the markdown:
27
+ 1. Carefully examine the image.
28
+ 2. Carefully examine the markdown input provided.
29
+ 3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
30
+ 4. Assign component scores, as described below.
31
+
32
+ These are the primary scores:
33
+ - Overall - the overall quality of the markdown as compared to the image.
34
+ - Text quality - the quality of the text extraction from the image.
35
+ - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
36
+
37
+ Depending on which elements are present in the markdown, you will assign element-specific scores.
38
+ - Tables - how effectively the tables have been extracted and formatted.
39
+ - Forms - how effectively the forms have been extracted and formatted.
40
+ - Equations - how effectively block equations have been converted to LaTeX.
41
+ - Section headers - if all of the section headers have been detected, and the right levels set.
42
+ - Lists - if the lists have been properly extracted and formatted.
43
+ - Images - if images are identified and placed correctly.
44
+
45
+ Notes on scoring:
46
+ - To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
47
+ - A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
48
+ - A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
49
+ - Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.
50
+
51
+ Output json, like in the example below.
52
+
53
+ **Example**
54
+ Input
55
+ ```markdown
56
+ # Section 1
57
+ This is some *markdown* extracted from a document. Here is a block equation:
58
+ $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
59
+ ```
60
+ Output
61
+ ```json
62
+ {
63
+ "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
64
+ "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
65
+ "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
66
+ "overall": 5,
67
+ "text": 5,
68
+ "formatting": 5,
69
+ "section_headers": 5,
70
+ "tables": 0,
71
+ "forms": 0,
72
+ "equations": 5,
73
+ "lists": 0,
74
+ "images": 0
75
+ }
76
+ ```
77
+ **Input**
78
+ ```markdown
79
+ {{markdown}}
80
+ ```
81
+ **Output**
82
+ """
83
+
84
+ comparison_keys = ["comparison"]
85
+ description_keys = ["image_description", "markdown_description"]
86
+ text_keys = comparison_keys + description_keys
87
+ score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
88
+ "lists", "images"]
89
+
90
+
91
+ class LLMScorer(BaseScorer):
92
+ def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
93
+ pdf_bytes = sample["pdf"]
94
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
95
+ f.write(pdf_bytes)
96
+ f.flush()
97
+ f.seek(0)
98
+ doc = pdfium.PdfDocument(f.name)
99
+ img = doc[0].render(scale=96/72).to_pil()
100
+ doc.close()
101
+
102
+ return self.llm_rater(img, markdown)
103
+
104
+
105
+ def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
106
+ req_keys = text_keys + score_keys
107
+ properties = {}
108
+ for key in req_keys:
109
+ content_type = content.Type.INTEGER if key in score_keys else content.Type.STRING
110
+ properties[key] = content.Schema(type=content_type)
111
+
112
+ response_schema = content.Schema(
113
+ type=content.Type.OBJECT,
114
+ required=req_keys,
115
+ properties=properties
116
+ )
117
+
118
+ prompt = rating_prompt.replace("{{markdown}}", markdown)
119
+ response = self.llm_response_wrapper([img, prompt], response_schema)
120
+ assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
121
+ return {
122
+ "score": response["overall"],
123
+ "specific_scores": response,
124
+ }
125
+
126
+ def llm_response_wrapper(self, prompt, response_schema, depth=0):
127
+ import google.generativeai as genai
128
+ genai.configure(api_key=settings.GOOGLE_API_KEY)
129
+ gemini_model = genai.GenerativeModel("gemini-1.5-flash")
130
+ try:
131
+ responses = gemini_model.generate_content(
132
+ prompt,
133
+ stream=False,
134
+ generation_config={
135
+ "temperature": 0,
136
+ "response_schema": response_schema,
137
+ "response_mime_type": "application/json",
138
+ },
139
+ request_options={'timeout': 60}
140
+ )
141
+ output = responses.candidates[0].content.parts[0].text
142
+ return json.loads(output)
143
+ except ResourceExhausted as e:
144
+ print(f"Hit Gemini rate limit, waiting 120 seconds")
145
+ time.sleep(120)
146
+ if depth > 2:
147
+ raise e
148
+ return self.llm_response_wrapper(prompt, response_schema, depth + 1)
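The LLM scorer renders the sample's single-page PDF to an image and asks Gemini to rate the markdown against it, so it needs `settings.GOOGLE_API_KEY` plus the Google generative AI dependencies. A minimal sketch of scoring one prediction outside the benchmark loop:

```python
import datasets

from benchmarks.overall.scorers.llm import LLMScorer

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
sample = ds[0]                               # has a single-page "pdf" field
predicted_markdown = "# A guess at the page" # stand-in for a method's output

scores = LLMScorer()(sample, gt_markdown=[], markdown=predicted_markdown)
print(scores["score"], scores["specific_scores"]["formatting"])
```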
benchmarks/overall/scorers/schema.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import TypedDict, List, Optional, Dict
2
+
3
+
4
+ class BlockScores(TypedDict):
5
+ score: float
6
+ specific_scores: Dict[str, float | List[float]]
benchmarks/overall/scoring.py DELETED
@@ -1,83 +0,0 @@
1
- from typing import List
2
-
3
- from rapidfuzz import fuzz
4
-
5
- from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
6
- from benchmarks.overall.schema import BlockScores
7
-
8
-
9
- def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
10
- n = len(correct_order)
11
- concordant = 0
12
- discordant = 0
13
-
14
- if n <= 1:
15
- return 100
16
-
17
- for i in range(n):
18
- for j in range(i + 1, n):
19
- correct_sign = correct_order[i] - correct_order[j]
20
- actual_sign = actual_order[i] - actual_order[j]
21
-
22
- if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
23
- concordant += 1
24
- elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
25
- discordant += 1
26
-
27
- total_pairs = (n * (n - 1)) // 2
28
- tau = (concordant - discordant) / total_pairs
29
- tau = (tau + 1) / 2 # 0-1 scale
30
- return tau * 100 # 0-100 scale
31
-
32
-
33
- def find_fuzzy_alignments(
34
- main_string: str,
35
- substrings: List[str],
36
- threshold: int = 70
37
- ) -> List[dict]:
38
- alignments = []
39
-
40
- for idx, substr in enumerate(substrings):
41
- result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
42
-
43
- score = 0
44
- dest_start = 0
45
- dest_end = 0
46
- if result:
47
- score = result.score
48
- dest_start = result.dest_start
49
- dest_end = result.dest_end
50
-
51
- alignments.append({
52
- "string": substr,
53
- "start": dest_start,
54
- "end": dest_end,
55
- "score": score,
56
- "idx": idx
57
- })
58
- return alignments
59
-
60
-
61
- def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
62
- alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
63
- scores = [alignment["score"] for alignment in alignments]
64
-
65
- # Find order score
66
- orders = [alignment["start"] for alignment in alignments]
67
- correct_order = list(range(len(gt_markdown)))
68
- actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
69
- order_score = kendall_tau(correct_order, actual_order)
70
-
71
- # Weight score by sequence length
72
- gt_weights = [len(g) for g in gt_markdown]
73
- weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
74
-
75
- # Weight the score by sequence length
76
- overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
77
- overall_score = overall_score * 0.8 + order_score * 0.2
78
- return {
79
- "scores": scores,
80
- "order_score": order_score,
81
- "overall_score": overall_score,
82
- "time": None
83
- }