Vik Paruchuri committed
Commit · a19295b
1 Parent(s): 2bcf51e
Clean up benchmark, make more pluggable
- benchmarks/__init__.py +0 -0
- benchmarks/overall/display/__init__.py +62 -0
- benchmarks/overall/inference.py +0 -48
- benchmarks/overall/methods/__init__.py +91 -0
- benchmarks/overall/methods/gt.py +29 -0
- benchmarks/overall/methods/marker.py +29 -0
- benchmarks/overall/methods/mathpix.py +22 -0
- benchmarks/overall/methods/schema.py +6 -0
- benchmarks/overall/overall.py +66 -99
- benchmarks/overall/registry.py +16 -0
- benchmarks/overall/render.py +0 -117
- benchmarks/overall/schema.py +8 -14
- benchmarks/overall/scorers/__init__.py +11 -0
- benchmarks/overall/{clean.py → scorers/clean.py} +0 -12
- benchmarks/overall/scorers/heuristic.py +96 -0
- benchmarks/overall/scorers/llm.py +148 -0
- benchmarks/overall/scorers/schema.py +6 -0
- benchmarks/overall/scoring.py +0 -83
benchmarks/__init__.py
ADDED
File without changes
benchmarks/overall/display/__init__.py
ADDED
@@ -0,0 +1,62 @@
from pathlib import Path
from typing import Dict, List

import tabulate

from benchmarks.overall.schema import FullResult

def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
    with open(out_path / filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n")
        f.write(table)
    print(title)
    print(table)


def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
    headers = ["Document Type"]
    for method in methods:
        for score_type in score_types:
            headers.append(f"{method} {score_type}")

    document_rows = [[k] for k in document_types]
    for i, doc_type in enumerate(document_types):
        for method in methods:
            for score_type in score_types:
                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
                document_rows[i].append(avg_score)

    write_table("Document Types", document_rows, headers, out_path, "document_types.md")

    headers = ["Block Type"]
    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
    block_score_types = list(result["averages_by_block_type"][default_method].keys())
    for method in methods:
        for score_type in block_score_types:
            headers.append(f"{method} {score_type}")

    block_rows = [[k] for k in block_types]
    for i, block_type in enumerate(block_types):
        for method in methods:
            for score_type in block_score_types:
                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
                block_rows[i].append(avg_score)

    write_table("Block types", block_rows, headers, out_path, "block_types.md")

    headers = ["Method", "Avg Time"] + score_types
    inference_rows = [[k] for k in methods]
    all_raw_scores = [result["scores"][i] for i in result["scores"]]
    for i, method in enumerate(methods):
        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
        inference_rows[i].append(avg_time)
        for score_type in score_types:
            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
            avg_score = sum(scores_lst) / max(1, len(scores_lst))
            inference_rows[i].append(avg_score)

    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")

    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
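A quick usage sketch for write_table, not part of this commit; it assumes only that tabulate is installed and that the output directory is writable, and the scores shown are invented:

    from pathlib import Path
    from benchmarks.overall.display import write_table

    # Hypothetical scores for two methods on two document types (illustrative values only).
    rows = [["scientific paper", 92.1, 88.4], ["magazine", 85.3, 80.2]]
    headers = ["Document Type", "marker heuristic", "mathpix heuristic"]
    out_dir = Path("benchmark_out")
    out_dir.mkdir(exist_ok=True)

    # Writes document_types.md with a "# Document Types" heading and a GitHub-style table,
    # and also prints both to stdout.
    write_table("Document Types", rows, headers, out_dir, "document_types.md")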
benchmarks/overall/inference.py
DELETED
@@ -1,48 +0,0 @@
import tempfile
import time

from benchmarks.overall.clean import clean_input
from benchmarks.overall.schema import BlockScores
from benchmarks.overall.scoring import score_blocks
from marker.converters.pdf import PdfConverter

def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
    block_converter = PdfConverter(
        artifact_dict=marker_models,
        config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
    )

    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
        f.write(pdf_bytes)
        rendered = block_converter(f.name)

    return rendered.markdown


def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
    pdf_bytes = sample["pdf"] # This is a single page PDF
    start = time.time()
    marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
    marker_md_clean = clean_input(marker_md)
    total = time.time() - start
    scores = score_blocks(gt_markdown, marker_md_clean)
    scores["time"] = total
    scores["markdown"] = marker_md
    return scores


def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
    uuid = sample["uuid"]
    data = None
    for row in mathpix_ds:
        if str(row["uuid"]) == str(uuid):
            data = row
            break
    if not data:
        raise ValueError(f"Could not find data for uuid {uuid}")

    mathpix_md_clean = clean_input(data["md"])
    scores = score_blocks(gt_markdown, mathpix_md_clean)
    scores["time"] = data["time"]
    scores["markdown"] = data["md"]
    return scores
benchmarks/overall/methods/__init__.py
ADDED
@@ -0,0 +1,91 @@
import io
import re
from typing import Tuple

import markdown2
from PIL import Image
from playwright.sync_api import sync_playwright

from benchmarks.overall.methods.schema import BenchmarkResult
from marker.renderers.markdown import MarkdownRenderer


class BaseMethod:
    def __init__(self, **kwargs):
        for kwarg in kwargs:
            if hasattr(self, kwarg):
                setattr(self, kwarg, kwargs[kwarg])

    @staticmethod
    def convert_to_md(html: str):
        md = MarkdownRenderer()
        markdown = md.md_cls.convert(html)
        return markdown

    def __call__(self, sample) -> BenchmarkResult:
        raise NotImplementedError()

    def render(self, markdown: str):
        return self.html_to_image(self.convert_to_html(markdown))

    @staticmethod
    def convert_to_html(md: str):
        block_placeholders = []
        inline_placeholders = []

        # Add placeholders for the math
        def block_sub(match):
            content = match.group(1)
            placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
            block_placeholders.append((placeholder, f"$${content}$$"))
            return placeholder

        def inline_sub(match):
            content = match.group(1)
            placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
            inline_placeholders.append((placeholder, f"${content}$"))
            return placeholder

        md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
        md = re.sub(r'\$(.*?)\$', inline_sub, md)

        html = markdown2.markdown(md, extras=['tables'])

        # Replace placeholders
        for placeholder, math_str in block_placeholders:
            html = html.replace(placeholder, math_str)
        for placeholder, math_str in inline_placeholders:
            html = html.replace(placeholder, math_str)

        return html

    def html_to_image(self, html: str) -> Image.Image:
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.set_content(f"""
<head>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
    <!-- The loading of KaTeX is deferred to speed up page rendering -->
    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
    <!-- To automatically render math in text elements, include the auto-render extension: -->
    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
</head>
<body>
    {html}
    <script>
        renderMathInElement(document.body, {{
            delimiters: [
                {{left: '$$', right: '$$', display: true}},
                {{left: '$', right: '$', display: false}}
            ]
        }});
    </script>
</body>
""")
            page.set_viewport_size({"width": 1200, "height": 800})
            page.wait_for_timeout(500)  # Wait for KaTeX to render
            screenshot_bytes = page.screenshot(full_page=True)
            browser.close()

        return Image.open(io.BytesIO(screenshot_bytes))
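A rough illustration of the math handling in BaseMethod.convert_to_html, not part of this commit; it assumes markdown2 and playwright are importable and skips the browser screenshot step, and the markdown string is made up:

    from benchmarks.overall.methods import BaseMethod

    md = "Inline math $x^2 + 1$ and a block:\n\n$$\\int_0^1 x\\,dx$$"
    # The $...$ and $$...$$ spans are swapped for placeholders before markdown2 runs,
    # then restored afterwards so KaTeX can render them in the browser later.
    html = BaseMethod.convert_to_html(md)
    print(html)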
benchmarks/overall/methods/gt.py
ADDED
@@ -0,0 +1,29 @@
from typing import List
import json

from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class GTMethod(BaseMethod):
    def __call__(self, sample) -> BenchmarkResult:
        gt_blocks = json.loads(sample["gt_blocks"])
        gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
        gt_markdown = [self.convert_to_md(block) for block in gt_html]
        return {
            "markdown": gt_markdown,
            "time": 0
        }

    def render(self, html: List[str]) -> Image.Image:
        joined = "\n\n".join(html)
        html = f"""
<html>
<head></head>
<body>
{joined}
</body>
</html>
""".strip()
        return self.html_to_image(html)
benchmarks/overall/methods/marker.py
ADDED
@@ -0,0 +1,29 @@
import tempfile
import time

from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.converters.pdf import PdfConverter


class MarkerMethod(BaseMethod):
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        pdf_bytes = sample["pdf"] # This is a single page PDF
        block_converter = PdfConverter(
            artifact_dict=self.model_dict,
            config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm}
        )

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            start = time.time()
            rendered = block_converter(f.name)
            total = time.time() - start

        return {
            "markdown": rendered.markdown,
            "time": total
        }
benchmarks/overall/methods/mathpix.py
ADDED
@@ -0,0 +1,22 @@
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class MathpixMethod(BaseMethod):
    mathpix_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        uuid = sample["uuid"]
        data = None
        for row in self.mathpix_ds:
            if str(row["uuid"]) == str(uuid):
                data = row
                break
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {
            "markdown": data["md"],
            "time": data["time"]
        }
benchmarks/overall/methods/schema.py
ADDED
@@ -0,0 +1,6 @@
from typing import TypedDict, List


class BenchmarkResult(TypedDict):
    markdown: str | List[str]
    time: float | None
benchmarks/overall/overall.py
CHANGED
Old version (removed lines prefixed with "-"; lines cut off by the diff viewer are left as captured):

@@ -2,117 +2,86 @@ import json
 import os
 from collections import defaultdict
 from pathlib import Path
-from typing import

 import click
 import datasets
-import tabulate
-from benchmarks.overall.render import build_dataset
 from tqdm import tqdm
-import pypdfium2 as pdfium

-from benchmarks.overall.
-from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.settings import settings

 configure_logging()


-def get_method_scores(
     bench_scores = {}
-    averages_by_type = defaultdict(list)
-    averages_by_block_type = defaultdict(list)
-
         if max_rows is not None and idx >= max_rows:
             break

-        gt_blocks = json.loads(sample["gt_blocks"])
         doc_type = sample["classification"]

-
-        gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
-        gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
-            scores = score_func(model_dict, sample, gt_markdown, **kwargs)
-        except ValueError as e:
-            print(f"Error with sample {idx}: {e}")
-            continue
-        except pdfium.PdfiumError as e:
-            print(f"Error opening pdf: {e}")
-            continue

-

-
-

-        bench_scores[idx] =

-    avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
     return {
-        "
         "averages_by_type": averages_by_type,
         "averages_by_block_type": averages_by_block_type,
-        "
-        "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
     }

-def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
-    inference_types = [default_method] + [k for k in scores.keys() if k != default_method]
-
-    document_types = list(scores[default_method]["averages_by_type"].keys())
-    document_rows = [[k] for k in document_types]
-    for k in inference_types:
-        for i, doc_type in enumerate(document_types):
-            avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
-            document_rows[i].append(avg)
-
-    print("Document types")
-    document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
-    print(document_type_table)
-    with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
-        f.write(document_type_table)
-
-    block_types = list(scores[default_method]["averages_by_block_type"].keys())
-    block_rows = [[k] for k in block_types]
-    for k in inference_types:
-        for i, block_type in enumerate(block_types):
-            avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
-            block_rows[i].append(avg)
-
-    print("Block types")
-    block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
-    print(block_type_table)
-    with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
-        f.write(block_type_table)
-
-    headers = ["Method", "Avg Score", "Avg Time"]
-    inference_rows = [[k] for k in inference_types]
-    for i, k in enumerate(inference_types):
-        inference_rows[i].append(scores[k]["average_score"])
-        inference_rows[i].append(scores[k]["average_time"])
-
-    print("Overall")
-    overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
-    print(overall_table)
-    with open(out_path / "overall.md", "w", encoding="utf-8") as f:
-        f.write(overall_table)
-
-    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
-
 @click.command(help="Benchmark PDF to MD conversion.")
 @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
-@click.option("--
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
 @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
 def main(
     dataset: str,
     out_dataset: str,
-
     result_path: str,
     max_rows: int,
     use_llm: bool
@@ -120,37 +89,35 @@ def main(
     out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)

-
-    methods = other_methods.split(",")
     for method in methods:
-        if method not in
-            raise ValueError(f"Method {method} not allowed. Allowed methods are {
-
-
-
-
-
-
-
     }

-
-    mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
-    mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
-    all_scores["mathpix"] = mathpix_scores
-
-    # Display formatted score tables
-    print_scores(all_scores, out_path)

-
-

-

-    # Push up comparison dataset
-    if out_dataset is not None:
-        out_ds = build_dataset(ds, all_scores)
-        out_ds.push_to_hub(out_dataset)

 if __name__ == "__main__":
     main()

New version (added lines prefixed with "+"):

 import os
 from collections import defaultdict
 from pathlib import Path
+from typing import List

 import click
 import datasets
 from tqdm import tqdm

+from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.settings import settings
+from benchmarks.overall.display import print_scores

 configure_logging()


+def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
     bench_scores = {}
+    averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    average_times = defaultdict(list)
+    markdown_by_method = defaultdict(dict)
+    for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark"):
         if max_rows is not None and idx >= max_rows:
             break

         doc_type = sample["classification"]
+        gt_cls = METHOD_REGISTRY["gt"]
+        gt_blocks = json.loads(sample["gt_blocks"])
+        gt_md = gt_cls(**artifacts)(sample)["markdown"]
+
+        out_data = defaultdict(dict)
+
+        for method in methods:
+            method_cls = METHOD_REGISTRY[method](**artifacts)
+            method_info = method_cls(sample)
+            method_md = method_info["markdown"]
+            average_times[method].append(method_info["time"])
+            markdown_by_method[idx][method] = method_md
+
+            for score_type in score_types:
+                score_cls = SCORE_REGISTRY[score_type]()
+                try:
+                    scores = score_cls(sample, gt_md, method_md)
+                except Exception as e:
+                    # Some scorers can fail, like the LLM one
+                    print(f"Failed to score {method} with {score_type}: {e}")
+                    continue

+                out_data[method][score_type] = scores

+                averages_by_type[method][score_type][doc_type].append(scores["score"])

+                if "by_block" in scores["specific_scores"]: # Not all scorers support this
+                    for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
+                        averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)

+        bench_scores[idx] = out_data

     return {
+        "scores": bench_scores,
+        "markdown": markdown_by_method,
         "averages_by_type": averages_by_type,
         "averages_by_block_type": averages_by_block_type,
+        "average_times": average_times,
     }

 @click.command(help="Benchmark PDF to MD conversion.")
 @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
+@click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix", default="marker")
+@click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic")
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
 @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
 def main(
     dataset: str,
     out_dataset: str,
+    methods: str,
+    scores: str,
     result_path: str,
     max_rows: int,
     use_llm: bool

     out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)

+    methods = methods.split(",")
     for method in methods:
+        if method not in METHOD_REGISTRY:
+            raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}")
+
+    # Ensure marker is always first
+    methods = list(set(methods))
+    methods = ["marker"] + [m for m in methods if m != "marker"]
+
+    score_types = scores.split(",")
+    for score_type in score_types:
+        if score_type not in SCORE_REGISTRY:
+            raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}")
+
+    benchmark_dataset = datasets.load_dataset(dataset, split="train")
+    artifacts = {
+        "model_dict": create_model_dict(),
+        "mathpix_ds": datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train"),
+        "use_llm": use_llm
     }

+    result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)

+    # Display benchmark scoring tables
+    print_scores(result, out_path, methods, score_types)

+    with open(out_path / "result.json", "w") as f:
+        json.dump(result, f)


 if __name__ == "__main__":
     main()
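A small driver sketch for the refactored entry point, not part of this commit; it assumes the marker models and both Hugging Face datasets can be downloaded, and the method list, scorer list, and row count are illustrative:

    import datasets
    from benchmarks.overall.overall import get_method_scores
    from marker.models import create_model_dict

    benchmark_dataset = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
    artifacts = {
        "model_dict": create_model_dict(),
        "mathpix_ds": datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train"),
        "use_llm": False,
    }
    # Score only the first 5 pages with marker and the heuristic scorer.
    result = get_method_scores(benchmark_dataset, ["marker"], ["heuristic"], artifacts, max_rows=5)
    print(result["average_times"]["marker"])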
benchmarks/overall/registry.py
ADDED
@@ -0,0 +1,16 @@
from benchmarks.overall.methods.gt import GTMethod
from benchmarks.overall.methods.marker import MarkerMethod
from benchmarks.overall.methods.mathpix import MathpixMethod
from benchmarks.overall.scorers.heuristic import HeuristicScorer
from benchmarks.overall.scorers.llm import LLMScorer

SCORE_REGISTRY = {
    "heuristic": HeuristicScorer,
    "llm": LLMScorer
}

METHOD_REGISTRY = {
    "marker": MarkerMethod,
    "gt": GTMethod,
    "mathpix": MathpixMethod
}
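The registries are what make the benchmark pluggable: adding a method or scorer is a subclass plus a dict entry. A sketch of wiring in a hypothetical method (the class, its name, and its behavior are invented for illustration and are not part of this commit):

    import json
    import time

    from benchmarks.overall.methods import BaseMethod, BenchmarkResult
    from benchmarks.overall.registry import METHOD_REGISTRY

    class EchoGTMethod(BaseMethod):
        # Hypothetical method that just echoes the ground truth HTML back as "markdown".
        def __call__(self, sample) -> BenchmarkResult:
            start = time.time()
            gt_blocks = json.loads(sample["gt_blocks"])
            md = "\n\n".join(block["html"] for block in gt_blocks)
            return {"markdown": md, "time": time.time() - start}

    # get_method_scores will now accept "echo_gt" in its methods list.
    METHOD_REGISTRY["echo_gt"] = EchoGTMethod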
benchmarks/overall/render.py
DELETED
@@ -1,117 +0,0 @@
import subprocess
import tempfile
import pypdfium2 as pdfium
from typing import Dict
from collections import defaultdict
import re
import io
import json

from PIL import Image
import datasets
import markdown2
from playwright.sync_api import sync_playwright

from benchmarks.overall.clean import convert_to_md, clean_input
from benchmarks.overall.schema import FullResult

def convert_to_html(md: str):
    block_placeholders = []
    inline_placeholders = []

    # Add placeholders for the math
    def block_sub(match):
        content = match.group(1)
        placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
        block_placeholders.append((placeholder, f"$${content}$$"))
        return placeholder

    def inline_sub(match):
        content = match.group(1)
        placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
        inline_placeholders.append((placeholder, f"${content}$"))
        return placeholder

    md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
    md = re.sub(r'\$(.*?)\$', inline_sub, md)

    html = markdown2.markdown(md, extras=['tables'])

    # Replace placeholders
    for placeholder, math_str in block_placeholders:
        html = html.replace(placeholder, math_str)
    for placeholder, math_str in inline_placeholders:
        html = html.replace(placeholder, math_str)

    return html


def markdown_to_image(md: str) -> Image.Image:
    html = convert_to_html(md)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.set_content(f"""
<head>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
    <!-- The loading of KaTeX is deferred to speed up page rendering -->
    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
    <!-- To automatically render math in text elements, include the auto-render extension: -->
    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
</head>
<body>
    {html}
    <script>
        renderMathInElement(document.body, {{
            delimiters: [
                {{left: '$$', right: '$$', display: true}},
                {{left: '$', right: '$', display: false}}
            ]
        }});
    </script>
</body>
""")
        page.set_viewport_size({"width": 1200, "height": 800})
        page.wait_for_timeout(500)  # Wait for KaTeX to render
        screenshot_bytes = page.screenshot(full_page=True)
        browser.close()

    return Image.open(io.BytesIO(screenshot_bytes))


def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
    # Get all the dataset indices that went through inference
    full_idxs = None
    for method in all_scores:
        result_idxs = list(all_scores[method]["raw_scores"].keys())
        if full_idxs is None:
            full_idxs = sorted(result_idxs)
        else:
            full_idxs = [f for f in full_idxs if f in result_idxs]

    ds_rows = defaultdict(dict)
    for idx in full_idxs:
        row = ds[idx]
        ds_rows[idx].update({
            "img": row["img"],
            "classification": row["classification"],
            "language": row["language"],
            "uuid": row["uuid"]
        })
        for method in all_scores:
            method_row = all_scores[method]["raw_scores"][idx]
            ds_rows[idx].update({
                f"{method}_score": method_row["overall_score"],
                f"{method}_markdown": method_row["markdown"],
                f"{method}_image": markdown_to_image(method_row["markdown"]),
                f"{method}_time": method_row["time"]
            })
        gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
        gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
        ds_rows[idx].update({
            "gt_markdown": gt_md,
            "gt_markdown_image": markdown_to_image(gt_md)
        })
    out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
    return out_dataset
benchmarks/overall/schema.py
CHANGED
@@ -1,18 +1,12 @@
Old version (removed lines prefixed with "-"; lines cut off by the diff viewer are left as captured):

-from typing import TypedDict, List, Dict


-
-    scores: List[float]
-    order_score: float
-    overall_score: float
-    time: Optional[float]
-    markdown: str
-

 class FullResult(TypedDict):
-
-    averages_by_type:
-    averages_by_block_type:
-
-
-    gt_markdown: List[str]

New version (added lines prefixed with "+"):

+from typing import TypedDict, List, Dict

+from benchmarks.overall.scorers.schema import BlockScores

+AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]

 class FullResult(TypedDict):
+    scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
+    averages_by_type: AVG_TYPE
+    averages_by_block_type: AVG_TYPE
+    average_times: Dict[str, List[float]]
+    markdown: Dict[int, Dict[str, str]]
benchmarks/overall/scorers/__init__.py
ADDED
@@ -0,0 +1,11 @@
from typing import List

from benchmarks.overall.scorers.schema import BlockScores


class BaseScorer:
    def __init__(self):
        pass

    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        raise NotImplementedError()
benchmarks/overall/{clean.py → scorers/clean.py}
RENAMED
@@ -5,8 +5,6 @@ from pathlib import Path

 import latex2mathml.converter

-from marker.renderers.markdown import MarkdownRenderer
-
 class MarkdownCleaner:
     def __init__(self):
         pass
@@ -112,14 +110,4 @@ class MarkdownCleaner:
         return latex_str


-def convert_to_md(html):
-    md = MarkdownRenderer()
-    markdown = md.md_cls.convert(html)
-    return markdown
-
-def clean_input(markdown):
-    cleaner = MarkdownCleaner()
-    return cleaner(markdown)
-
-
benchmarks/overall/scorers/heuristic.py
ADDED
@@ -0,0 +1,96 @@
from typing import List

from rapidfuzz import fuzz

from benchmarks.overall.scorers.clean import MarkdownCleaner
from benchmarks.overall.scorers.schema import BlockScores
from benchmarks.overall.scorers import BaseScorer


class HeuristicScorer(BaseScorer):
    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        # Standardize inputs
        gt_markdown = [self.clean_input(block) for block in gt_markdown]
        method_markdown = self.clean_input(method_markdown)

        alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
        scores = [alignment["score"] for alignment in alignments]

        # Find order score
        orders = [alignment["start"] for alignment in alignments]
        correct_order = list(range(len(gt_markdown)))
        actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
        order_score = self.kendall_tau(correct_order, actual_order)

        # Weight score by sequence length
        gt_weights = [len(g) for g in gt_markdown]
        weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]

        # Weight the score by sequence length
        overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
        overall_score = overall_score * 0.8 + order_score * 0.2
        return {
            "score": overall_score,
            "specific_scores": {
                "order": order_score,
                "by_block": scores
            },
        }

    @staticmethod
    def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
        n = len(correct_order)
        concordant = 0
        discordant = 0

        if n <= 1:
            return 100

        for i in range(n):
            for j in range(i + 1, n):
                correct_sign = correct_order[i] - correct_order[j]
                actual_sign = actual_order[i] - actual_order[j]

                if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
                    concordant += 1
                elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
                    discordant += 1

        total_pairs = (n * (n - 1)) // 2
        tau = (concordant - discordant) / total_pairs
        tau = (tau + 1) / 2 # 0-1 scale
        return tau * 100 # 0-100 scale

    @staticmethod
    def find_fuzzy_alignments(
            main_string: str,
            substrings: List[str],
            threshold: int = 70
    ) -> List[dict]:
        alignments = []

        for idx, substr in enumerate(substrings):
            result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)

            score = 0
            dest_start = 0
            dest_end = 0
            if result:
                score = result.score
                dest_start = result.dest_start
                dest_end = result.dest_end

            alignments.append({
                "string": substr,
                "start": dest_start,
                "end": dest_end,
                "score": score,
                "idx": idx
            })
        return alignments


    @staticmethod
    def clean_input(md: str):
        cleaner = MarkdownCleaner()
        return cleaner(md)
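A sketch of how the heuristic score behaves, exercising the two static helpers directly (not part of this commit; it assumes rapidfuzz and latex2mathml, which the clean-up module pulls in on import, are installed, and the strings are invented):

    from benchmarks.overall.scorers.heuristic import HeuristicScorer

    gt_blocks = ["# Results", "The model scored 95% on the test set."]
    pred = "# Results\n\nThe model scored 95% on the test set."

    # Each ground-truth block is fuzzily located inside the predicted markdown.
    alignments = HeuristicScorer.find_fuzzy_alignments(pred, gt_blocks)
    print([a["score"] for a in alignments])            # per-block 0-100 edit-distance scores
    print(HeuristicScorer.kendall_tau([0, 1], [0, 1]))  # 100.0 when blocks appear in order

The overall score is the length-weighted mean of the per-block scores (80%) blended with the Kendall-tau order score (20%), as in __call__ above.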
benchmarks/overall/scorers/llm.py
ADDED
@@ -0,0 +1,148 @@
import json
import tempfile
import time
from typing import List

from PIL import Image
from google.ai.generativelanguage_v1beta.types import content
from google.api_core.exceptions import ResourceExhausted
import pypdfium2 as pdfium

from benchmarks.overall.scorers import BaseScorer, BlockScores
from marker.settings import settings

rating_prompt = """
You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
You're given an image, along with the extracted markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.

The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.

**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the markdown input provided.
3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
4. Assign component scores, as described below.

These are the primary scores:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.

Depending on which elements are present in the markdown, you will assign element-specific scores.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Section headers - if all of the section headers have been detected, and the right levels set.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.

Notes on scoring:
- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.

Output json, like in the example below.

**Example**
Input
```markdown
# Section 1
This is some *markdown* extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Output
```json
{
    "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
    "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
    "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
    "overall": 5,
    "text": 5,
    "formatting": 5,
    "section_headers": 5,
    "tables": 0,
    "forms": 0,
    "equations": 5,
    "lists": 0,
    "images": 0
}
```
**Input**
```markdown
{{markdown}}
```
**Output**
"""

comparison_keys = ["comparison"]
description_keys = ["image_description", "markdown_description"]
text_keys = comparison_keys + description_keys
score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
              "lists", "images"]


class LLMScorer(BaseScorer):
    def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
        pdf_bytes = sample["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf_bytes)
            f.flush()
            f.seek(0)
            doc = pdfium.PdfDocument(f.name)
            img = doc[0].render(scale=96/72).to_pil()
            doc.close()

        return self.llm_rater(img, markdown)


    def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
        req_keys = text_keys + score_keys
        properties = {}
        for key in req_keys:
            content_type = content.Type.INTEGER if key in score_keys else content.Type.STRING
            properties[key] = content.Schema(type=content_type)

        response_schema = content.Schema(
            type=content.Type.OBJECT,
            required=req_keys,
            properties=properties
        )

        prompt = rating_prompt.replace("{{markdown}}", markdown)
        response = self.llm_response_wrapper([img, prompt], response_schema)
        assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
        return {
            "score": response["overall"],
            "specific_scores": response,
        }

    def llm_response_wrapper(self, prompt, response_schema, depth=0):
        import google.generativeai as genai
        genai.configure(api_key=settings.GOOGLE_API_KEY)
        gemini_model = genai.GenerativeModel("gemini-1.5-flash")
        try:
            responses = gemini_model.generate_content(
                prompt,
                stream=False,
                generation_config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
                request_options={'timeout': 60}
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except ResourceExhausted as e:
            print(f"Hit Gemini rate limit, waiting 120 seconds")
            time.sleep(120)
            if depth > 2:
                raise e
            return self.llm_response_wrapper(prompt, response_schema, depth + 1)
benchmarks/overall/scorers/schema.py
ADDED
@@ -0,0 +1,6 @@
from typing import TypedDict, List, Optional, Dict


class BlockScores(TypedDict):
    score: float
    specific_scores: Dict[str, float | List[float]]
benchmarks/overall/scoring.py
DELETED
@@ -1,83 +0,0 @@
from typing import List

from rapidfuzz import fuzz

from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
from benchmarks.overall.schema import BlockScores


def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
    n = len(correct_order)
    concordant = 0
    discordant = 0

    if n <= 1:
        return 100

    for i in range(n):
        for j in range(i + 1, n):
            correct_sign = correct_order[i] - correct_order[j]
            actual_sign = actual_order[i] - actual_order[j]

            if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
                concordant += 1
            elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
                discordant += 1

    total_pairs = (n * (n - 1)) // 2
    tau = (concordant - discordant) / total_pairs
    tau = (tau + 1) / 2 # 0-1 scale
    return tau * 100 # 0-100 scale


def find_fuzzy_alignments(
        main_string: str,
        substrings: List[str],
        threshold: int = 70
) -> List[dict]:
    alignments = []

    for idx, substr in enumerate(substrings):
        result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)

        score = 0
        dest_start = 0
        dest_end = 0
        if result:
            score = result.score
            dest_start = result.dest_start
            dest_end = result.dest_end

        alignments.append({
            "string": substr,
            "start": dest_start,
            "end": dest_end,
            "score": score,
            "idx": idx
        })
    return alignments


def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
    alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
    scores = [alignment["score"] for alignment in alignments]

    # Find order score
    orders = [alignment["start"] for alignment in alignments]
    correct_order = list(range(len(gt_markdown)))
    actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
    order_score = kendall_tau(correct_order, actual_order)

    # Weight score by sequence length
    gt_weights = [len(g) for g in gt_markdown]
    weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]

    # Weight the score by sequence length
    overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
    overall_score = overall_score * 0.8 + order_score * 0.2
    return {
        "scores": scores,
        "order_score": order_score,
        "overall_score": overall_score,
        "time": None
    }