Vik Paruchuri committed
Commit a19295b · 1 Parent(s): 2bcf51e

Clean up benchmark, make more pluggable

benchmarks/__init__.py ADDED
File without changes
benchmarks/overall/display/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ from pathlib import Path
2
+ from typing import Dict, List
3
+
4
+ import tabulate
5
+
6
+ from benchmarks.overall.schema import FullResult
7
+
8
+ def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
9
+ table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
10
+ with open(out_path / filename, "w", encoding="utf-8") as f:
11
+ f.write(f"# {title}\n")
12
+ f.write(table)
13
+ print(title)
14
+ print(table)
15
+
16
+
17
+ def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
18
+ document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
19
+ headers = ["Document Type"]
20
+ for method in methods:
21
+ for score_type in score_types:
22
+ headers.append(f"{method} {score_type}")
23
+
24
+ document_rows = [[k] for k in document_types]
25
+ for i, doc_type in enumerate(document_types):
26
+ for method in methods:
27
+ for score_type in score_types:
28
+ avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
29
+ document_rows[i].append(avg_score)
30
+
31
+ write_table("Document Types", document_rows, headers, out_path, "document_types.md")
32
+
33
+ headers = ["Block Type"]
34
+ block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
35
+ block_score_types = list(result["averages_by_block_type"][default_method].keys())
36
+ for method in methods:
37
+ for score_type in block_score_types:
38
+ headers.append(f"{method} {score_type}")
39
+
40
+ block_rows = [[k] for k in block_types]
41
+ for i, block_type in enumerate(block_types):
42
+ for method in methods:
43
+ for score_type in block_score_types:
44
+ avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
45
+ block_rows[i].append(avg_score)
46
+
47
+ write_table("Block types", block_rows, headers, out_path, "block_types.md")
48
+
49
+ headers = ["Method", "Avg Time"] + score_types
50
+ inference_rows = [[k] for k in methods]
51
+ all_raw_scores = [result["scores"][i] for i in result["scores"]]
52
+ for i, method in enumerate(methods):
53
+ avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
54
+ inference_rows[i].append(avg_time)
55
+ for score_type in score_types:
56
+ scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
57
+ avg_score = sum(scores_lst) / max(1, len(scores_lst))
58
+ inference_rows[i].append(avg_score)
59
+
60
+ write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
61
+
62
+ print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
benchmarks/overall/inference.py DELETED
@@ -1,48 +0,0 @@
1
- import tempfile
2
- import time
3
-
4
- from benchmarks.overall.clean import clean_input
5
- from benchmarks.overall.schema import BlockScores
6
- from benchmarks.overall.scoring import score_blocks
7
- from marker.converters.pdf import PdfConverter
8
-
9
- def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
10
- block_converter = PdfConverter(
11
- artifact_dict=marker_models,
12
- config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
13
- )
14
-
15
- with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
16
- f.write(pdf_bytes)
17
- rendered = block_converter(f.name)
18
-
19
- return rendered.markdown
20
-
21
-
22
- def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
23
- pdf_bytes = sample["pdf"] # This is a single page PDF
24
- start = time.time()
25
- marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
26
- marker_md_clean = clean_input(marker_md)
27
- total = time.time() - start
28
- scores = score_blocks(gt_markdown, marker_md_clean)
29
- scores["time"] = total
30
- scores["markdown"] = marker_md
31
- return scores
32
-
33
-
34
- def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
35
- uuid = sample["uuid"]
36
- data = None
37
- for row in mathpix_ds:
38
- if str(row["uuid"]) == str(uuid):
39
- data = row
40
- break
41
- if not data:
42
- raise ValueError(f"Could not find data for uuid {uuid}")
43
-
44
- mathpix_md_clean = clean_input(data["md"])
45
- scores = score_blocks(gt_markdown, mathpix_md_clean)
46
- scores["time"] = data["time"]
47
- scores["markdown"] = data["md"]
48
- return scores
benchmarks/overall/methods/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ import io
2
+ import re
3
+ from typing import Tuple
4
+
5
+ import markdown2
6
+ from PIL import Image
7
+ from playwright.sync_api import sync_playwright
8
+
9
+ from benchmarks.overall.methods.schema import BenchmarkResult
10
+ from marker.renderers.markdown import MarkdownRenderer
11
+
12
+
13
+ class BaseMethod:
14
+ def __init__(self, **kwargs):
15
+ for kwarg in kwargs:
16
+ if hasattr(self, kwarg):
17
+ setattr(self, kwarg, kwargs[kwarg])
18
+
19
+ @staticmethod
20
+ def convert_to_md(html: str):
21
+ md = MarkdownRenderer()
22
+ markdown = md.md_cls.convert(html)
23
+ return markdown
24
+
25
+ def __call__(self, sample) -> BenchmarkResult:
26
+ raise NotImplementedError()
27
+
28
+ def render(self, markdown: str):
29
+ return self.html_to_image(self.convert_to_html(markdown))
30
+
31
+ @staticmethod
32
+ def convert_to_html(md: str):
33
+ block_placeholders = []
34
+ inline_placeholders = []
35
+
36
+ # Add placeholders for the math
37
+ def block_sub(match):
38
+ content = match.group(1)
39
+ placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
40
+ block_placeholders.append((placeholder, f"$${content}$$"))
41
+ return placeholder
42
+
43
+ def inline_sub(match):
44
+ content = match.group(1)
45
+ placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
46
+ inline_placeholders.append((placeholder, f"${content}$"))
47
+ return placeholder
48
+
49
+ md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
50
+ md = re.sub(r'\$(.*?)\$', inline_sub, md)
51
+
52
+ html = markdown2.markdown(md, extras=['tables'])
53
+
54
+ # Replace placeholders
55
+ for placeholder, math_str in block_placeholders:
56
+ html = html.replace(placeholder, math_str)
57
+ for placeholder, math_str in inline_placeholders:
58
+ html = html.replace(placeholder, math_str)
59
+
60
+ return html
61
+
62
+ def html_to_image(self, html: str) -> Image.Image:
63
+ with sync_playwright() as p:
64
+ browser = p.chromium.launch()
65
+ page = browser.new_page()
66
+ page.set_content(f"""
67
+ <head>
68
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
69
+ <!-- The loading of KaTeX is deferred to speed up page rendering -->
70
+ <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
71
+ <!-- To automatically render math in text elements, include the auto-render extension: -->
72
+ <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
73
+ </head>
74
+ <body>
75
+ {html}
76
+ <script>
77
+ renderMathInElement(document.body, {{
78
+ delimiters: [
79
+ {{left: '$$', right: '$$', display: true}},
80
+ {{left: '$', right: '$', display: false}}
81
+ ]
82
+ }});
83
+ </script>
84
+ </body>
85
+ """)
86
+ page.set_viewport_size({"width": 1200, "height": 800})
87
+ page.wait_for_timeout(500) # Wait for KaTeX to render
88
+ screenshot_bytes = page.screenshot(full_page=True)
89
+ browser.close()
90
+
91
+ return Image.open(io.BytesIO(screenshot_bytes))
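The placeholder substitution in `convert_to_html` exists because `markdown2` would otherwise mangle `$...$` and `$$...$$` spans; the math is swapped out before conversion and restored afterwards so KaTeX's auto-render can pick it up in the screenshot. A quick illustration, assuming the module's dependencies (markdown2, playwright, marker) are installed:

```python
from benchmarks.overall.methods import BaseMethod

md = "Inline math $x^2 + 1$ and a block:\n\n$$\\int_0^1 x\\,dx$$"
html = BaseMethod.convert_to_html(md)
print(html)  # the $...$ and $$...$$ spans survive markdown2 untouched
```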
benchmarks/overall/methods/gt.py ADDED
@@ -0,0 +1,29 @@
1
+ from typing import List
2
+ import json
3
+
4
+ from PIL import Image
5
+
6
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
7
+
8
+
9
+ class GTMethod(BaseMethod):
10
+ def __call__(self, sample) -> BenchmarkResult:
11
+ gt_blocks = json.loads(sample["gt_blocks"])
12
+ gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
13
+ gt_markdown = [self.convert_to_md(block) for block in gt_html]
14
+ return {
15
+ "markdown": gt_markdown,
16
+ "time": 0
17
+ }
18
+
19
+ def render(self, html: List[str]) -> Image.Image:
20
+ joined = "\n\n".join(html)
21
+ html = f"""
22
+ <html>
23
+ <head></head>
24
+ <body>
25
+ {joined}
26
+ </body>
27
+ </html>
28
+ """.strip()
29
+ return self.html_to_image(html)
benchmarks/overall/methods/marker.py ADDED
@@ -0,0 +1,29 @@
1
+ import tempfile
2
+ import time
3
+
4
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
5
+ from marker.converters.pdf import PdfConverter
6
+
7
+
8
+ class MarkerMethod(BaseMethod):
9
+ model_dict: dict = None
10
+ use_llm: bool = False
11
+
12
+ def __call__(self, sample) -> BenchmarkResult:
13
+ pdf_bytes = sample["pdf"] # This is a single page PDF
14
+ block_converter = PdfConverter(
15
+ artifact_dict=self.model_dict,
16
+ config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm}
17
+ )
18
+
19
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
20
+ f.write(pdf_bytes)
21
+ start = time.time()
22
+ rendered = block_converter(f.name)
23
+ total = time.time() - start
24
+
25
+ return {
26
+ "markdown": rendered.markdown,
27
+ "time": total
28
+ }
29
+
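A minimal sketch of invoking the marker method directly on one benchmark row, mirroring what the refactored scoring loop does per sample (the first dataset row is just an illustrative choice):

```python
import datasets

from benchmarks.overall.methods.marker import MarkerMethod
from marker.models import create_model_dict

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
method = MarkerMethod(model_dict=create_model_dict(), use_llm=False)

result = method(ds[0])  # BenchmarkResult: {"markdown": ..., "time": ...}
print(result["time"], result["markdown"][:200])
```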
benchmarks/overall/methods/mathpix.py ADDED
@@ -0,0 +1,22 @@
1
+ import datasets
2
+
3
+ from benchmarks.overall.methods import BaseMethod, BenchmarkResult
4
+
5
+
6
+ class MathpixMethod(BaseMethod):
7
+ mathpix_ds: datasets.Dataset = None
8
+
9
+ def __call__(self, sample) -> BenchmarkResult:
10
+ uuid = sample["uuid"]
11
+ data = None
12
+ for row in self.mathpix_ds:
13
+ if str(row["uuid"]) == str(uuid):
14
+ data = row
15
+ break
16
+ if not data:
17
+ raise ValueError(f"Could not find data for uuid {uuid}")
18
+
19
+ return {
20
+ "markdown": data["md"],
21
+ "time": data["time"]
22
+ }
benchmarks/overall/methods/schema.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import TypedDict, List
2
+
3
+
4
+ class BenchmarkResult(TypedDict):
5
+ markdown: str | List[str]
6
+ time: float | None
benchmarks/overall/overall.py CHANGED
@@ -2,117 +2,86 @@ import json
2
  import os
3
  from collections import defaultdict
4
  from pathlib import Path
5
- from typing import Dict
6
 
7
  import click
8
  import datasets
9
- import tabulate
10
- from benchmarks.overall.render import build_dataset
11
  from tqdm import tqdm
12
- import pypdfium2 as pdfium
13
 
14
- from benchmarks.overall.clean import convert_to_md, clean_input
15
- from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
16
  from benchmarks.overall.schema import FullResult
17
  from marker.logger import configure_logging
18
  from marker.models import create_model_dict
19
  from marker.settings import settings
 
20
 
21
  configure_logging()
22
 
23
 
24
- def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult:
25
  bench_scores = {}
26
- averages_by_type = defaultdict(list)
27
- averages_by_block_type = defaultdict(list)
28
- for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
 
 
29
  if max_rows is not None and idx >= max_rows:
30
  break
31
 
32
- gt_blocks = json.loads(sample["gt_blocks"])
33
  doc_type = sample["classification"]
34
 
35
- try:
36
- gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
37
- gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
38
- scores = score_func(model_dict, sample, gt_markdown, **kwargs)
39
- except ValueError as e:
40
- print(f"Error with sample {idx}: {e}")
41
- continue
42
- except pdfium.PdfiumError as e:
43
- print(f"Error opening pdf: {e}")
44
- continue
45
 
46
- averages_by_type[doc_type].append(scores["overall_score"])
47
 
48
- for score, gt_block in zip(scores["scores"], gt_blocks):
49
- averages_by_block_type[gt_block["block_type"]].append(score)
 
50
 
51
- bench_scores[idx] = scores
52
 
53
- avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
54
  return {
55
- "raw_scores": bench_scores,
 
56
  "averages_by_type": averages_by_type,
57
  "averages_by_block_type": averages_by_block_type,
58
- "average_time": avg_time,
59
- "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
60
  }
61
 
62
- def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
63
- inference_types = [default_method] + [k for k in scores.keys() if k != default_method]
64
-
65
- document_types = list(scores[default_method]["averages_by_type"].keys())
66
- document_rows = [[k] for k in document_types]
67
- for k in inference_types:
68
- for i, doc_type in enumerate(document_types):
69
- avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
70
- document_rows[i].append(avg)
71
-
72
- print("Document types")
73
- document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
74
- print(document_type_table)
75
- with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
76
- f.write(document_type_table)
77
-
78
- block_types = list(scores[default_method]["averages_by_block_type"].keys())
79
- block_rows = [[k] for k in block_types]
80
- for k in inference_types:
81
- for i, block_type in enumerate(block_types):
82
- avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
83
- block_rows[i].append(avg)
84
-
85
- print("Block types")
86
- block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
87
- print(block_type_table)
88
- with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
89
- f.write(block_type_table)
90
-
91
- headers = ["Method", "Avg Score", "Avg Time"]
92
- inference_rows = [[k] for k in inference_types]
93
- for i, k in enumerate(inference_types):
94
- inference_rows[i].append(scores[k]["average_score"])
95
- inference_rows[i].append(scores[k]["average_time"])
96
-
97
- print("Overall")
98
- overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
99
- print(overall_table)
100
- with open(out_path / "overall.md", "w", encoding="utf-8") as f:
101
- f.write(overall_table)
102
-
103
- print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
104
-
105
  @click.command(help="Benchmark PDF to MD conversion.")
106
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
107
  @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
108
- @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
 
109
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
110
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
111
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
112
  def main(
113
  dataset: str,
114
  out_dataset: str,
115
- other_methods: str,
 
116
  result_path: str,
117
  max_rows: int,
118
  use_llm: bool
@@ -120,37 +89,35 @@ def main(
120
  out_path = Path(result_path)
121
  out_path.mkdir(parents=True, exist_ok=True)
122
 
123
- allowed_methods = ["mathpix", ""]
124
- methods = other_methods.split(",")
125
  for method in methods:
126
- if method not in allowed_methods:
127
- raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")
128
-
129
- model_dict = create_model_dict()
130
- ds = datasets.load_dataset(dataset, split="train")
131
-
132
- marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm)
133
- all_scores = {
134
- "marker": marker_scores
  }
136
 
137
- if "mathpix" in methods:
138
- mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
139
- mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
140
- all_scores["mathpix"] = mathpix_scores
141
-
142
- # Display formatted score tables
143
- print_scores(all_scores, out_path)
144
 
145
- with open(out_path / "overall.json", "w", encoding="utf-8") as f:
146
- json.dump(all_scores, f, indent=2, ensure_ascii=False)
147
 
148
- print(f"Results saved to {out_path}.")
 
149
 
150
- # Push up comparison dataset
151
- if out_dataset is not None:
152
- out_ds = build_dataset(ds, all_scores)
153
- out_ds.push_to_hub(out_dataset)
154
 
155
  if __name__ == "__main__":
156
  main()
 
2
  import os
3
  from collections import defaultdict
4
  from pathlib import Path
5
+ from typing import List
6
 
7
  import click
8
  import datasets
 
 
9
  from tqdm import tqdm
 
10
 
11
+ from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 
12
  from benchmarks.overall.schema import FullResult
13
  from marker.logger import configure_logging
14
  from marker.models import create_model_dict
15
  from marker.settings import settings
16
+ from benchmarks.overall.display import print_scores
17
 
18
  configure_logging()
19
 
20
 
21
+ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
22
  bench_scores = {}
23
+ averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
24
+ averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
25
+ average_times = defaultdict(list)
26
+ markdown_by_method = defaultdict(dict)
27
+ for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark"):
28
  if max_rows is not None and idx >= max_rows:
29
  break
30
 
 
31
  doc_type = sample["classification"]
32
+ gt_cls = METHOD_REGISTRY["gt"]
33
+ gt_blocks = json.loads(sample["gt_blocks"])
34
+ gt_md = gt_cls(**artifacts)(sample)["markdown"]
35
+
36
+ out_data = defaultdict(dict)
37
+
38
+ for method in methods:
39
+ method_cls = METHOD_REGISTRY[method](**artifacts)
40
+ method_info = method_cls(sample)
41
+ method_md = method_info["markdown"]
42
+ average_times[method].append(method_info["time"])
43
+ markdown_by_method[idx][method] = method_md
44
+
45
+ for score_type in score_types:
46
+ score_cls = SCORE_REGISTRY[score_type]()
47
+ try:
48
+ scores = score_cls(sample, gt_md, method_md)
49
+ except Exception as e:
50
+ # Some scorers can fail, like the LLM one
51
+ print(f"Failed to score {method} with {score_type}: {e}")
52
+ continue
53
 
54
+ out_data[method][score_type] = scores
55
 
56
+ averages_by_type[method][score_type][doc_type].append(scores["score"])
57
 
58
+ if "by_block" in scores["specific_scores"]: # Not all scorers support this
59
+ for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
60
+ averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
61
 
62
+ bench_scores[idx] = out_data
63
 
 
64
  return {
65
+ "scores": bench_scores,
66
+ "markdown": markdown_by_method,
67
  "averages_by_type": averages_by_type,
68
  "averages_by_block_type": averages_by_block_type,
69
+ "average_times": average_times,
 
70
  }
71
 
72
  @click.command(help="Benchmark PDF to MD conversion.")
73
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
74
  @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
75
+ @click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix", default="marker")
76
+ @click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic")
77
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
78
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
79
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
80
  def main(
81
  dataset: str,
82
  out_dataset: str,
83
+ methods: str,
84
+ scores: str,
85
  result_path: str,
86
  max_rows: int,
87
  use_llm: bool
 
89
  out_path = Path(result_path)
90
  out_path.mkdir(parents=True, exist_ok=True)
91
 
92
+ methods = methods.split(",")
 
93
  for method in methods:
94
+ if method not in METHOD_REGISTRY:
95
+ raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}")
96
+
97
+ # Ensure marker is always first
98
+ methods = list(set(methods))
99
+ methods = ["marker"] + [m for m in methods if m != "marker"]
100
+
101
+ score_types = scores.split(",")
102
+ for score_type in score_types:
103
+ if score_type not in SCORE_REGISTRY:
104
+ raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}")
105
+
106
+ benchmark_dataset = datasets.load_dataset(dataset, split="train")
107
+ artifacts = {
108
+ "model_dict": create_model_dict(),
109
+ "mathpix_ds": datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train"),
110
+ "use_llm": use_llm
111
  }
112
 
113
+ result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)
114
 
115
+ # Display benchmark scoring tables
116
+ print_scores(result, out_path, methods, score_types)
117
 
118
+ with open(out_path / "result.json", "w") as f:
119
+ json.dump(result, f)
120
 
 
 
 
 
121
 
122
  if __name__ == "__main__":
123
  main()
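With the refactor, `main` only resolves names against the registries, so comparing methods and scorers comes down to flags like `--methods marker,mathpix --scores heuristic,llm`. A rough sketch of the same flow driven programmatically (argument values are illustrative; `mathpix_ds` is only needed when mathpix is among the methods):

```python
import datasets

from benchmarks.overall.overall import get_method_scores
from marker.models import create_model_dict

benchmark_dataset = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
artifacts = {
    "model_dict": create_model_dict(),
    "mathpix_ds": None,   # load datalab-to/marker_benchmark_mathpix when comparing mathpix
    "use_llm": False,
}

result = get_method_scores(benchmark_dataset, ["marker"], ["heuristic"], artifacts, max_rows=5)
print(result["averages_by_type"]["marker"]["heuristic"].keys())  # document types seen
```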
benchmarks/overall/registry.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from benchmarks.overall.methods.gt import GTMethod
2
+ from benchmarks.overall.methods.marker import MarkerMethod
3
+ from benchmarks.overall.methods.mathpix import MathpixMethod
4
+ from benchmarks.overall.scorers.heuristic import HeuristicScorer
5
+ from benchmarks.overall.scorers.llm import LLMScorer
6
+
7
+ SCORE_REGISTRY = {
8
+ "heuristic": HeuristicScorer,
9
+ "llm": LLMScorer
10
+ }
11
+
12
+ METHOD_REGISTRY = {
13
+ "marker": MarkerMethod,
14
+ "gt": GTMethod,
15
+ "mathpix": MathpixMethod
16
+ }
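The registries are the pluggability point: a new conversion method or scorer is one class plus one dictionary entry, after which the CLI accepts it by name. A hedged sketch with a hypothetical `echo` method (not part of this commit):

```python
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from benchmarks.overall.registry import METHOD_REGISTRY


class EchoMethod(BaseMethod):
    # Hypothetical toy method: returns fixed markdown instantly.
    def __call__(self, sample) -> BenchmarkResult:
        return {"markdown": "# placeholder output", "time": 0.0}


METHOD_REGISTRY["echo"] = EchoMethod  # `--methods marker,echo` would then resolve
print(sorted(METHOD_REGISTRY))        # ['echo', 'gt', 'marker', 'mathpix']
```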
benchmarks/overall/render.py DELETED
@@ -1,117 +0,0 @@
1
- import subprocess
2
- import tempfile
3
- import pypdfium2 as pdfium
4
- from typing import Dict
5
- from collections import defaultdict
6
- import re
7
- import io
8
- import json
9
-
10
- from PIL import Image
11
- import datasets
12
- import markdown2
13
- from playwright.sync_api import sync_playwright
14
-
15
- from benchmarks.overall.clean import convert_to_md, clean_input
16
- from benchmarks.overall.schema import FullResult
17
-
18
- def convert_to_html(md: str):
19
- block_placeholders = []
20
- inline_placeholders = []
21
-
22
- # Add placeholders for the math
23
- def block_sub(match):
24
- content = match.group(1)
25
- placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
26
- block_placeholders.append((placeholder, f"$${content}$$"))
27
- return placeholder
28
-
29
- def inline_sub(match):
30
- content = match.group(1)
31
- placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
32
- inline_placeholders.append((placeholder, f"${content}$"))
33
- return placeholder
34
-
35
- md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
36
- md = re.sub(r'\$(.*?)\$', inline_sub, md)
37
-
38
- html = markdown2.markdown(md, extras=['tables'])
39
-
40
- # Replace placeholders
41
- for placeholder, math_str in block_placeholders:
42
- html = html.replace(placeholder, math_str)
43
- for placeholder, math_str in inline_placeholders:
44
- html = html.replace(placeholder, math_str)
45
-
46
- return html
47
-
48
-
49
- def markdown_to_image(md: str) -> Image.Image:
50
- html = convert_to_html(md)
51
- with sync_playwright() as p:
52
- browser = p.chromium.launch()
53
- page = browser.new_page()
54
- page.set_content(f"""
55
- <head>
56
- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
57
- <!-- The loading of KaTeX is deferred to speed up page rendering -->
58
- <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
59
- <!-- To automatically render math in text elements, include the auto-render extension: -->
60
- <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
61
- </head>
62
- <body>
63
- {html}
64
- <script>
65
- renderMathInElement(document.body, {{
66
- delimiters: [
67
- {{left: '$$', right: '$$', display: true}},
68
- {{left: '$', right: '$', display: false}}
69
- ]
70
- }});
71
- </script>
72
- </body>
73
- """)
74
- page.set_viewport_size({"width": 1200, "height": 800})
75
- page.wait_for_timeout(500) # Wait for KaTeX to render
76
- screenshot_bytes = page.screenshot(full_page=True)
77
- browser.close()
78
-
79
- return Image.open(io.BytesIO(screenshot_bytes))
80
-
81
-
82
- def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
83
- # Get all the dataset indices that went through inference
84
- full_idxs = None
85
- for method in all_scores:
86
- result_idxs = list(all_scores[method]["raw_scores"].keys())
87
- if full_idxs is None:
88
- full_idxs = sorted(result_idxs)
89
- else:
90
- full_idxs = [f for f in full_idxs if f in result_idxs]
91
-
92
- ds_rows = defaultdict(dict)
93
- for idx in full_idxs:
94
- row = ds[idx]
95
- ds_rows[idx].update({
96
- "img": row["img"],
97
- "classification": row["classification"],
98
- "language": row["language"],
99
- "uuid": row["uuid"]
100
- })
101
- for method in all_scores:
102
- method_row = all_scores[method]["raw_scores"][idx]
103
- ds_rows[idx].update({
104
- f"{method}_score": method_row["overall_score"],
105
- f"{method}_markdown": method_row["markdown"],
106
- f"{method}_image": markdown_to_image(method_row["markdown"]),
107
- f"{method}_time": method_row["time"]
108
- })
109
- gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
110
- gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
111
- ds_rows[idx].update({
112
- "gt_markdown": gt_md,
113
- "gt_markdown_image": markdown_to_image(gt_md)
114
- })
115
- out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
116
- return out_dataset
117
-
benchmarks/overall/schema.py CHANGED
@@ -1,18 +1,12 @@
1
- from typing import TypedDict, List, Dict, Optional
2
 
 
3
 
4
- class BlockScores(TypedDict):
5
- scores: List[float]
6
- order_score: float
7
- overall_score: float
8
- time: Optional[float]
9
- markdown: str
10
-
11
 
12
  class FullResult(TypedDict):
13
- raw_scores: Dict[int, BlockScores]
14
- averages_by_type: Dict[str, List[float]]
15
- averages_by_block_type: Dict[str, List[float]]
16
- average_time: float
17
- average_score: float
18
- gt_markdown: List[str]
 
1
+ from typing import TypedDict, List, Dict
2
 
3
+ from benchmarks.overall.scorers.schema import BlockScores
4
 
5
+ AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]
6
 
7
  class FullResult(TypedDict):
8
+ scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
9
+ averages_by_type: AVG_TYPE
10
+ averages_by_block_type: AVG_TYPE
11
+ average_times: Dict[str, List[float]]
12
+ markdown: Dict[int, Dict[str, str]]
 
benchmarks/overall/scorers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from typing import List
2
+
3
+ from benchmarks.overall.scorers.schema import BlockScores
4
+
5
+
6
+ class BaseScorer:
7
+ def __init__(self):
8
+ pass
9
+
10
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
11
+ raise NotImplementedError()
benchmarks/overall/{clean.py → scorers/clean.py} RENAMED
@@ -5,8 +5,6 @@ from pathlib import Path
5
 
6
  import latex2mathml.converter
7
 
8
- from marker.renderers.markdown import MarkdownRenderer
9
-
10
  class MarkdownCleaner:
11
  def __init__(self):
12
  pass
@@ -112,14 +110,4 @@ class MarkdownCleaner:
112
  return latex_str
113
 
114
 
115
- def convert_to_md(html):
116
- md = MarkdownRenderer()
117
- markdown = md.md_cls.convert(html)
118
- return markdown
119
-
120
- def clean_input(markdown):
121
- cleaner = MarkdownCleaner()
122
- return cleaner(markdown)
123
-
124
-
125
 
 
5
 
6
  import latex2mathml.converter
7
 
 
 
8
  class MarkdownCleaner:
9
  def __init__(self):
10
  pass
 
110
  return latex_str
111
 
112
 
113
 
benchmarks/overall/scorers/heuristic.py ADDED
@@ -0,0 +1,96 @@
1
+ from typing import List
2
+
3
+ from rapidfuzz import fuzz
4
+
5
+ from benchmarks.overall.scorers.clean import MarkdownCleaner
6
+ from benchmarks.overall.scorers.schema import BlockScores
7
+ from benchmarks.overall.scorers import BaseScorer
8
+
9
+
10
+ class HeuristicScorer(BaseScorer):
11
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
12
+ # Standardize inputs
13
+ gt_markdown = [self.clean_input(block) for block in gt_markdown]
14
+ method_markdown = self.clean_input(method_markdown)
15
+
16
+ alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
17
+ scores = [alignment["score"] for alignment in alignments]
18
+
19
+ # Find order score
20
+ orders = [alignment["start"] for alignment in alignments]
21
+ correct_order = list(range(len(gt_markdown)))
22
+ actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
23
+ order_score = self.kendall_tau(correct_order, actual_order)
24
+
25
+ # Weight score by sequence length
26
+ gt_weights = [len(g) for g in gt_markdown]
27
+ weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
28
+
29
+ # Weight the score by sequence length
30
+ overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
31
+ overall_score = overall_score * 0.8 + order_score * 0.2
32
+ return {
33
+ "score": overall_score,
34
+ "specific_scores": {
35
+ "order": order_score,
36
+ "by_block": scores
37
+ },
38
+ }
39
+
40
+ @staticmethod
41
+ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
42
+ n = len(correct_order)
43
+ concordant = 0
44
+ discordant = 0
45
+
46
+ if n <= 1:
47
+ return 100
48
+
49
+ for i in range(n):
50
+ for j in range(i + 1, n):
51
+ correct_sign = correct_order[i] - correct_order[j]
52
+ actual_sign = actual_order[i] - actual_order[j]
53
+
54
+ if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
55
+ concordant += 1
56
+ elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
57
+ discordant += 1
58
+
59
+ total_pairs = (n * (n - 1)) // 2
60
+ tau = (concordant - discordant) / total_pairs
61
+ tau = (tau + 1) / 2 # 0-1 scale
62
+ return tau * 100 # 0-100 scale
63
+
64
+ @staticmethod
65
+ def find_fuzzy_alignments(
66
+ main_string: str,
67
+ substrings: List[str],
68
+ threshold: int = 70
69
+ ) -> List[dict]:
70
+ alignments = []
71
+
72
+ for idx, substr in enumerate(substrings):
73
+ result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
74
+
75
+ score = 0
76
+ dest_start = 0
77
+ dest_end = 0
78
+ if result:
79
+ score = result.score
80
+ dest_start = result.dest_start
81
+ dest_end = result.dest_end
82
+
83
+ alignments.append({
84
+ "string": substr,
85
+ "start": dest_start,
86
+ "end": dest_end,
87
+ "score": score,
88
+ "idx": idx
89
+ })
90
+ return alignments
91
+
92
+
93
+ @staticmethod
94
+ def clean_input(md: str):
95
+ cleaner = MarkdownCleaner()
96
+ return cleaner(md)
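A small self-contained check of the heuristic scorer on toy blocks; scores are 0-100, weighted by block length, with a Kendall-tau term for ordering. Requires the module's dependencies (rapidfuzz, and latex2mathml for the cleaner):

```python
from benchmarks.overall.scorers.heuristic import HeuristicScorer

gt_blocks = ["# Introduction", "This paper studies markdown conversion quality."]
predicted = "# Introduction\n\nThis paper studies markdown conversion qualty."  # one typo

scores = HeuristicScorer()(None, gt_blocks, predicted)   # the sample argument is unused here
print(scores["score"])                                   # overall 0-100
print(scores["specific_scores"]["order"])                # ordering component
print(scores["specific_scores"]["by_block"])             # per-block alignment scores
```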
benchmarks/overall/scorers/llm.py ADDED
@@ -0,0 +1,148 @@
1
+ import json
2
+ import tempfile
3
+ import time
4
+ from typing import List
5
+
6
+ from PIL import Image
7
+ from google.ai.generativelanguage_v1beta.types import content
8
+ from google.api_core.exceptions import ResourceExhausted
9
+ import pypdfium2 as pdfium
10
+
11
+ from benchmarks.overall.scorers import BaseScorer, BlockScores
12
+ from marker.settings import settings
13
+
14
+ rating_prompt = """
15
+ You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
16
+ You're given an image, along with the extracted markdown:
17
+ - Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
18
+ - Tables will be formatted as Github flavored markdown.
19
+ - Block equations will be in LaTeX.
20
+ - The image and markdown may be in any language.
21
+ - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
22
+
23
+ The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
24
+
25
+ **Instructions**
26
+ Follow this process to evaluate the markdown:
27
+ 1. Carefully examine the image.
28
+ 2. Carefully examine the markdown input provided.
29
+ 3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
30
+ 4. Assign component scores, as described below.
31
+
32
+ These are the primary scores:
33
+ - Overall - the overall quality of the markdown as compared to the image.
34
+ - Text quality - the quality of the text extraction from the image.
35
+ - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
36
+
37
+ Depending on which elements are present in the markdown, you will assign element-specific scores.
38
+ - Tables - how effectively the tables have been extracted and formatted.
39
+ - Forms - how effectively the forms have been extracted and formatted.
40
+ - Equations - how effectively block equations have been converted to LaTeX.
41
+ - Section headers - if all of the section headers have been detected, and the right levels set.
42
+ - Lists - if the lists have been properly extracted and formatted.
43
+ - Images - if images are identified and placed correctly.
44
+
45
+ Notes on scoring:
46
+ - To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
47
+ - A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
48
+ - A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
49
+ - Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.
50
+
51
+ Output json, like in the example below.
52
+
53
+ **Example**
54
+ Input
55
+ ```markdown
56
+ # Section 1
57
+ This is some *markdown* extracted from a document. Here is a block equation:
58
+ $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
59
+ ```
60
+ Output
61
+ ```json
62
+ {
63
+ "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
64
+ "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
65
+ "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
66
+ "overall": 5,
67
+ "text": 5,
68
+ "formatting": 5,
69
+ "section_headers": 5,
70
+ "tables": 0,
71
+ "forms": 0,
72
+ "equations": 5,
73
+ "lists": 0,
74
+ "images": 0
75
+ }
76
+ ```
77
+ **Input**
78
+ ```markdown
79
+ {{markdown}}
80
+ ```
81
+ **Output**
82
+ """
83
+
84
+ comparison_keys = ["comparison"]
85
+ description_keys = ["image_description", "markdown_description"]
86
+ text_keys = comparison_keys + description_keys
87
+ score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
88
+ "lists", "images"]
89
+
90
+
91
+ class LLMScorer(BaseScorer):
92
+ def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
93
+ pdf_bytes = sample["pdf"]
94
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
95
+ f.write(pdf_bytes)
96
+ f.flush()
97
+ f.seek(0)
98
+ doc = pdfium.PdfDocument(f.name)
99
+ img = doc[0].render(scale=96/72).to_pil()
100
+ doc.close()
101
+
102
+ return self.llm_rater(img, markdown)
103
+
104
+
105
+ def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
106
+ req_keys = text_keys + score_keys
107
+ properties = {}
108
+ for key in req_keys:
109
+ content_type = content.Type.INTEGER if key in score_keys else content.Type.STRING
110
+ properties[key] = content.Schema(type=content_type)
111
+
112
+ response_schema = content.Schema(
113
+ type=content.Type.OBJECT,
114
+ required=req_keys,
115
+ properties=properties
116
+ )
117
+
118
+ prompt = rating_prompt.replace("{{markdown}}", markdown)
119
+ response = self.llm_response_wrapper([img, prompt], response_schema)
120
+ assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
121
+ return {
122
+ "score": response["overall"],
123
+ "specific_scores": response,
124
+ }
125
+
126
+ def llm_response_wrapper(self, prompt, response_schema, depth=0):
127
+ import google.generativeai as genai
128
+ genai.configure(api_key=settings.GOOGLE_API_KEY)
129
+ gemini_model = genai.GenerativeModel("gemini-1.5-flash")
130
+ try:
131
+ responses = gemini_model.generate_content(
132
+ prompt,
133
+ stream=False,
134
+ generation_config={
135
+ "temperature": 0,
136
+ "response_schema": response_schema,
137
+ "response_mime_type": "application/json",
138
+ },
139
+ request_options={'timeout': 60}
140
+ )
141
+ output = responses.candidates[0].content.parts[0].text
142
+ return json.loads(output)
143
+ except ResourceExhausted as e:
144
+ print(f"Hit Gemini rate limit, waiting 120 seconds")
145
+ time.sleep(120)
146
+ if depth > 2:
147
+ raise e
148
+ return self.llm_response_wrapper(prompt, response_schema, depth + 1)
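The LLM scorer renders the sample's single-page PDF to an image and asks Gemini to rate the markdown against it, so it needs `settings.GOOGLE_API_KEY` plus the Google generative AI dependencies. A minimal sketch of scoring one prediction outside the benchmark loop:

```python
import datasets

from benchmarks.overall.scorers.llm import LLMScorer

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
sample = ds[0]                               # has a single-page "pdf" field
predicted_markdown = "# A guess at the page" # stand-in for a method's output

scores = LLMScorer()(sample, gt_markdown=[], markdown=predicted_markdown)
print(scores["score"], scores["specific_scores"]["formatting"])
```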
benchmarks/overall/scorers/schema.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import TypedDict, List, Optional, Dict
2
+
3
+
4
+ class BlockScores(TypedDict):
5
+ score: float
6
+ specific_scores: Dict[str, float | List[float]]
benchmarks/overall/scoring.py DELETED
@@ -1,83 +0,0 @@
1
- from typing import List
2
-
3
- from rapidfuzz import fuzz
4
-
5
- from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
6
- from benchmarks.overall.schema import BlockScores
7
-
8
-
9
- def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
10
- n = len(correct_order)
11
- concordant = 0
12
- discordant = 0
13
-
14
- if n <= 1:
15
- return 100
16
-
17
- for i in range(n):
18
- for j in range(i + 1, n):
19
- correct_sign = correct_order[i] - correct_order[j]
20
- actual_sign = actual_order[i] - actual_order[j]
21
-
22
- if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
23
- concordant += 1
24
- elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
25
- discordant += 1
26
-
27
- total_pairs = (n * (n - 1)) // 2
28
- tau = (concordant - discordant) / total_pairs
29
- tau = (tau + 1) / 2 # 0-1 scale
30
- return tau * 100 # 0-100 scale
31
-
32
-
33
- def find_fuzzy_alignments(
34
- main_string: str,
35
- substrings: List[str],
36
- threshold: int = 70
37
- ) -> List[dict]:
38
- alignments = []
39
-
40
- for idx, substr in enumerate(substrings):
41
- result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
42
-
43
- score = 0
44
- dest_start = 0
45
- dest_end = 0
46
- if result:
47
- score = result.score
48
- dest_start = result.dest_start
49
- dest_end = result.dest_end
50
-
51
- alignments.append({
52
- "string": substr,
53
- "start": dest_start,
54
- "end": dest_end,
55
- "score": score,
56
- "idx": idx
57
- })
58
- return alignments
59
-
60
-
61
- def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
62
- alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
63
- scores = [alignment["score"] for alignment in alignments]
64
-
65
- # Find order score
66
- orders = [alignment["start"] for alignment in alignments]
67
- correct_order = list(range(len(gt_markdown)))
68
- actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
69
- order_score = kendall_tau(correct_order, actual_order)
70
-
71
- # Weight score by sequence length
72
- gt_weights = [len(g) for g in gt_markdown]
73
- weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
74
-
75
- # Weight the score by sequence length
76
- overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
77
- overall_score = overall_score * 0.8 + order_score * 0.2
78
- return {
79
- "scores": scores,
80
- "order_score": order_score,
81
- "overall_score": overall_score,
82
- "time": None
83
- }