Vik Paruchuri
committed on
Commit
·
49af83e
1
Parent(s):
31a680a
Update overall benchmark
Browse files- benchmarks/overall.py +0 -132
- benchmarks/overall/inference.py +47 -0
- benchmarks/overall/overall.py +88 -0
- benchmarks/overall/scoring.py +30 -0
- benchmarks/scoring.py +0 -36
- benchmarks/table/table.py +12 -10
- marker/builders/llm_layout.py +5 -1
- marker/processors/equation.py +5 -4
- marker/processors/llm/__init__.py +5 -1
- marker/processors/llm/llm_table_merge.py +5 -1
- poetry.lock +62 -53
- pyproject.toml +1 -1
benchmarks/overall.py
DELETED
|
@@ -1,132 +0,0 @@
|
|
| 1 |
-
import tempfile
|
| 2 |
-
import time
|
| 3 |
-
from collections import defaultdict
|
| 4 |
-
|
| 5 |
-
import click
|
| 6 |
-
from tqdm import tqdm
|
| 7 |
-
import pypdfium2 as pdfium
|
| 8 |
-
|
| 9 |
-
from marker.config.parser import ConfigParser
|
| 10 |
-
from marker.converters.pdf import PdfConverter
|
| 11 |
-
from marker.logger import configure_logging
|
| 12 |
-
from marker.models import create_model_dict
|
| 13 |
-
from pdftext.extraction import plain_text_output
|
| 14 |
-
import json
|
| 15 |
-
import os
|
| 16 |
-
import subprocess
|
| 17 |
-
import shutil
|
| 18 |
-
from tabulate import tabulate
|
| 19 |
-
|
| 20 |
-
from marker.settings import settings
|
| 21 |
-
from scoring import score_text
|
| 22 |
-
|
| 23 |
-
configure_logging()
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def nougat_prediction(pdf_filename, batch_size=1):
    """Convert a PDF to text by shelling out to the ``nougat`` command.

    Args:
        pdf_filename: Path of the PDF passed to nougat.
        batch_size: Value forwarded to nougat's ``--batchsize`` flag.

    Returns:
        The contents of the first file found in nougat's output directory.

    Raises:
        subprocess.CalledProcessError: If nougat exits with a non-zero status.
        IndexError: If nougat wrote nothing to the output directory.
    """
    # TemporaryDirectory cleans up on every exit path. The previous
    # mkdtemp()/rmtree() pair leaked the directory whenever nougat failed
    # (check=True raises) or reading the result raised.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            ["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)],
            check=True,
        )
        md_file = os.listdir(out_dir)[0]
        with open(os.path.join(out_dir, md_file), "r") as f:
            return f.read()
|
| 34 |
-
|
| 35 |
-
@click.command(help="Benchmark PDF to MD conversion.")
@click.argument("in_folder", type=str)
@click.argument("reference_folder", type=str)
@click.argument("out_file", type=str)
@click.option("--nougat", is_flag=True, help="Run nougat and compare")
@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
    """Benchmark PDF-to-markdown conversion against reference markdown files.

    Every ``*.pdf`` in ``in_folder`` is converted with each selected method,
    scored with ``score_text`` against the same-named ``.md`` file in
    ``reference_folder``, and timed. Per-file and aggregate statistics are
    written to ``out_file`` as JSON and printed as tables.

    Args:
        in_folder: Directory containing the benchmark PDFs.
        reference_folder: Directory containing ``<name>.md`` for each PDF.
        out_file: Path of the JSON report to write.
        nougat: Also run and score the ``nougat`` method.
        md_out_path: Optional directory for the generated markdown files.

    Raises:
        click.ClickException: If ``in_folder`` contains no PDF files.
    """
    methods = ["marker"]
    if nougat:
        methods.append("nougat")

    benchmark_files = [b for b in os.listdir(in_folder) if b.endswith(".pdf")]
    if not benchmark_files:
        # Without this the run ends in a bare ZeroDivisionError while
        # computing averages, after the models have already been loaded.
        raise click.ClickException(f"No PDF files found in {in_folder}")

    if md_out_path:
        # The per-method markdown files are opened for writing below; make
        # sure their directory exists first.
        os.makedirs(md_out_path, exist_ok=True)

    model_dict = create_model_dict()

    scores = defaultdict(dict)
    times = defaultdict(dict)
    pages = defaultdict(int)

    # Iterating the list directly (the index was unused) lets tqdm show a total.
    for fname in tqdm(benchmark_files):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        reference_filename = os.path.join(reference_folder, md_filename)
        with open(reference_filename, "r") as f:
            reference = f.read()

        pdf_filename = os.path.join(in_folder, fname)
        doc = pdfium.PdfDocument(pdf_filename)
        pages[fname] = len(doc)

        config_parser = ConfigParser({"output_format": "markdown"})
        for method in methods:
            start = time.time()
            if method == "marker":
                converter = PdfConverter(
                    config=config_parser.generate_config_dict(),
                    artifact_dict=model_dict,
                    processor_list=None,
                    renderer=config_parser.get_renderer()
                )
                full_text = converter(pdf_filename).markdown
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=1)
            elif method == "naive":
                # NOTE(review): "naive" is never appended to `methods` in this
                # function, so this branch is currently unreachable.
                full_text = plain_text_output(doc, workers=1)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)

    total_pages = sum(pages.values())
    with open(out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname:
                {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }

                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)

    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
if __name__ == "__main__":
|
| 131 |
-
main()
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/overall/inference.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import fitz as pymupdf
|
| 4 |
+
import tempfile
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
|
| 7 |
+
from marker.converters.pdf import PdfConverter
|
| 8 |
+
|
| 9 |
+
def open_pymupdf(pdf_bytes):
    """Open raw PDF bytes as a PyMuPDF document read from an in-memory stream."""
    return pymupdf.open(stream=io.BytesIO(pdf_bytes))
|
| 12 |
+
|
| 13 |
+
def clip_pdf_to_bbox(doc, bbox, padding=1):
    """Redact everything outside ``bbox`` on the first page, then crop to it.

    Four rectangles (left, top, right, bottom of the box, each pushed out by
    ``padding``) are marked for redaction and applied; the page's crop box is
    then set to ``bbox`` itself. The document is modified in place and also
    returned.
    """
    first_page = doc[0]
    bounds = first_page.bound()
    page_height, page_width = bounds.height, bounds.width

    # Strips surrounding the box: left, top, right, bottom.
    outside_strips = (
        (0, 0, bbox[0] - padding, page_height),
        (0, 0, page_width, bbox[1] - padding),
        (bbox[2] + padding, 0, page_width, page_height),
        (0, bbox[3] + padding, page_width, page_height),
    )
    for strip in outside_strips:
        first_page.add_redact_annot(pymupdf.Rect(*strip))
    # NOTE(review): placement after the loop is reconstructed from a listing
    # without indentation; confirm against the original file.
    first_page.apply_redactions()

    first_page.set_cropbox(pymupdf.Rect(*bbox))
    return doc
|
| 28 |
+
|
| 29 |
+
def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes):
    """Run marker on each ground-truth block region and collect its HTML.

    For every block, the PDF is reopened, clipped to the block's ``bbox``,
    saved to a temporary file and converted with a ``PdfConverter`` that is
    forced to the block's ``block_type``. The inner HTML of the rendered
    ``<body>`` is kept.

    Args:
        marker_models: Model dictionary passed as ``artifact_dict``.
        gt_blocks: Ground-truth blocks; each must have ``bbox`` and
            ``block_type`` keys.
        pdf_bytes: Raw bytes of the PDF the blocks belong to.

    Returns:
        One HTML string per entry of ``gt_blocks``, in the same order.
    """
    block_html = []
    for block in gt_blocks:
        bbox = block["bbox"]
        doc2 = open_pymupdf(pdf_bytes)
        try:
            clip_pdf_to_bbox(doc2, bbox)
            block_converter = PdfConverter(
                artifact_dict=marker_models,
                config={"page_range": [0], "force_layout_block": block["block_type"], "disable_tqdm": True},
                renderer="marker.renderers.html.HTMLRenderer"
            )
            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
                doc2.save(f)
                # The converter reopens the file by name, so buffered bytes
                # must reach disk first or it may read a truncated PDF.
                f.flush()
                rendered = block_converter(f.name)
        finally:
            # One document is opened per block; close it so handles do not
            # pile up over a long benchmark run.
            doc2.close()

        html = rendered.html
        soup = BeautifulSoup(html, "html.parser")
        inner_html = str(soup.find("body").decode_contents())
        block_html.append(inner_html)
    return block_html
|
benchmarks/overall/overall.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import click
|
| 7 |
+
import datasets
|
| 8 |
+
import tabulate
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
from marker.logger import configure_logging
|
| 12 |
+
from marker.models import create_model_dict
|
| 13 |
+
from inference import get_marker_block_html
|
| 14 |
+
from marker.settings import settings
|
| 15 |
+
from scoring import score_blocks
|
| 16 |
+
|
| 17 |
+
configure_logging()
|
| 18 |
+
|
| 19 |
+
@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values:", default="")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
def main(
    dataset: str,
    other_methods: str,
    result_path: str,
    max_rows: int
):
    """Score marker's per-block HTML against ground truth and report averages.

    Each dataset sample supplies a PDF plus JSON-encoded ground-truth blocks.
    Marker is run on every block, the output is scored with ``score_blocks``,
    and a per-sample score weighted by ground-truth HTML length is computed.
    Averages by document type and by block type, and the overall average, are
    printed; per-sample results are written to ``<result_path>/overall.json``.

    Args:
        dataset: Dataset name or path passed to ``datasets.load_dataset``.
        other_methods: Comma-separated extra methods; each must be allowed.
        result_path: Directory for the JSON results (created if missing).
        max_rows: Maximum number of samples to process, or ``None`` for all.

    Raises:
        ValueError: If ``other_methods`` names a method that is not allowed.
        click.ClickException: If no sample could be scored.
    """
    allowed_methods = [""]
    methods = other_methods.split(",")
    for method in methods:
        if method not in allowed_methods:
            raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")

    model_dict = create_model_dict()
    ds = datasets.load_dataset(dataset, split="train")

    bench_scores = {}
    averages_by_type = defaultdict(list)
    averages_by_block_type = defaultdict(list)
    for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
        # Enforce the cap before doing any work. Checking `idx >= max_rows`
        # after scoring let max_rows + 1 samples through.
        if max_rows is not None and idx >= max_rows:
            break

        gt_blocks = json.loads(sample["gt_blocks"])
        doc_type = sample["classification"]
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        gt_html = [block["html"] for block in gt_blocks]
        gt_weights = [len(ht) for ht in gt_html]
        total_weight = sum(gt_weights)
        if total_weight == 0:
            # No ground-truth text to weight by: the weighted mean is
            # undefined (it used to divide by zero), so skip the sample
            # before spending inference time on it.
            continue

        marker_html = get_marker_block_html(model_dict, gt_blocks, pdf_bytes)
        scores = score_blocks(gt_html, marker_html)
        overall_score = sum(s * w for s, w in zip(scores, gt_weights)) / total_weight
        bench_scores[idx] = {
            "scores": scores,
            "weights": gt_weights,
            "overall_score": overall_score  # Weighted score, weighted by length of GT block
        }

        averages_by_type[doc_type].append(overall_score)

        for score, gt_block in zip(scores, gt_blocks):
            averages_by_block_type[gt_block["block_type"]].append(score)

    if not bench_scores:
        # Avoid a bare ZeroDivisionError in the overall average below.
        raise click.ClickException("No samples were scored; nothing to report.")

    doc_type_rows = sorted((k, sum(v) / len(v)) for k, v in averages_by_type.items())
    print(tabulate.tabulate(doc_type_rows, headers=["Document Type", "Average Score"], tablefmt="github"))

    block_type_rows = sorted((k, sum(v) / len(v)) for k, v in averages_by_block_type.items())
    print(tabulate.tabulate(block_type_rows, headers=["Block Type", "Average Score"], tablefmt="github"))

    overall_average = sum(entry["overall_score"] for entry in bench_scores.values()) / len(bench_scores)
    print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))

    # The default result directory is not guaranteed to exist yet.
    out_dir = Path(result_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "overall.json"
    with open(out_path, "w") as f:
        json.dump(bench_scores, f, indent=2)

    print(f"Results saved to {out_path}.")
|
| 85 |
+
|
| 86 |
+
if __name__ == "__main__":
|
| 87 |
+
main()
|
| 88 |
+
|
benchmarks/overall/scoring.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
|
| 4 |
+
from markdownify import markdownify as md
|
| 5 |
+
from rapidfuzz import fuzz
|
| 6 |
+
|
| 7 |
+
def standardize_html(html):
    """Reduce an HTML fragment to a compact markdown string for comparison.

    All heading tags are renamed to ``h1``, the HTML is converted with
    ``markdownify``, whitespace runs are collapsed, runs of periods are
    reduced to one, and the result is stripped.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Heading depth should not affect the score, so flatten every level to h1.
    for heading in parsed.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        heading.name = "h1"

    text = md(str(parsed)).replace("<br>", "\n")

    # Applied in order. After the first pass collapses every whitespace run
    # (newlines included) to a single space, the newline pass has nothing left
    # to match; it is kept so behaviour stays identical.
    substitutions = (
        (r"\s+", " "),
        (r"\n+", "\n"),
        ("\\.+", "."),  # Repeated periods, like table-of-contents leaders
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def score_blocks(gt_html, method_html):
    """Score each method block against its ground-truth block.

    Both sides are passed through ``standardize_html`` and compared with
    ``fuzz.ratio``. Blocks are paired positionally with ``zip``, so the result
    has as many scores as the shorter of the two inputs.
    """
    return [
        fuzz.ratio(standardize_html(expected), standardize_html(produced))
        for expected, produced in zip(gt_html, method_html)
    ]
|
benchmarks/scoring.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
from rapidfuzz import fuzz
|
| 2 |
-
from statistics import mean
|
| 3 |
-
|
| 4 |
-
CHUNK_MIN_CHARS = 25
|
| 5 |
-
|
| 6 |
-
def chunk_text(text, chunk_len=500):
    """Cut ``text`` into consecutive ``chunk_len``-character chunks.

    Chunks that are whitespace-only, or not longer than ``CHUNK_MIN_CHARS``,
    are dropped.
    """
    kept = []
    for start in range(0, len(text), chunk_len):
        piece = text[start:start + chunk_len]
        if piece.strip() and len(piece) > CHUNK_MIN_CHARS:
            kept.append(piece)
    return kept
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def overlap_score(hypothesis_chunks, reference_chunks):
    """Score each hypothesis chunk by its best fuzzy match in the reference.

    For hypothesis chunk ``i``, a window of reference chunks around a scaled
    offset is searched and the highest ``fuzz.ratio`` (rescaled to 0-1, with a
    cutoff of 30) is kept.

    Args:
        hypothesis_chunks: Chunks of the text being evaluated.
        reference_chunks: Chunks of the reference text.

    Returns:
        One score per hypothesis chunk; 0 where no reference chunk in the
        window reached the cutoff, or when there are no reference chunks.
    """
    if not reference_chunks:
        # With nothing to compare against, no chunk can match. This used to
        # raise ZeroDivisionError when computing the length ratio.
        return [0] * len(hypothesis_chunks)

    # NOTE(review): the offset below multiplies a hypothesis index by
    # len(hypothesis) / len(reference). Mapping into reference positions would
    # normally scale by the inverse ratio; left unchanged here because it
    # affects scores -- confirm the intended alignment.
    length_modifier = len(hypothesis_chunks) / len(reference_chunks)
    search_distance = max(len(reference_chunks) // 5, 10)
    chunk_scores = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        max_score = 0
        i_offset = int(i * length_modifier)
        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
        for j in chunk_range:
            score = fuzz.ratio(hyp_chunk, reference_chunks[j], score_cutoff=30) / 100
            if score > max_score:
                max_score = score
        chunk_scores.append(max_score)
    return chunk_scores
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def score_text(hypothesis, reference):
    """Return the mean chunk alignment score (0-1) of hypothesis vs. reference.

    Both texts are split with ``chunk_text``, each hypothesis chunk is scored
    with ``overlap_score``, and the scores are averaged.
    """
    return mean(overlap_score(chunk_text(hypothesis), chunk_text(reference)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/table/table.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
-
from tkinter import Image
|
| 4 |
-
|
| 5 |
-
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 6 |
|
|
|
|
|
|
|
| 7 |
from typing import List
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import base64
|
| 10 |
import time
|
|
@@ -20,6 +20,7 @@ from pypdfium2._helpers.misc import PdfiumError
|
|
| 20 |
import pypdfium2 as pdfium
|
| 21 |
from marker.util import matrix_intersection_area
|
| 22 |
from marker.renderers.json import JSONOutput, JSONBlockOutput
|
|
|
|
| 23 |
|
| 24 |
from marker.config.parser import ConfigParser
|
| 25 |
from marker.converters.table import TableConverter
|
|
@@ -47,7 +48,7 @@ def extract_tables(children: List[JSONBlockOutput]):
|
|
| 47 |
|
| 48 |
|
| 49 |
@click.command(help="Benchmark Table to HTML Conversion")
|
| 50 |
-
@click.
|
| 51 |
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
|
| 52 |
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
|
| 53 |
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
|
|
@@ -55,7 +56,7 @@ def extract_tables(children: List[JSONBlockOutput]):
|
|
| 55 |
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
|
| 56 |
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
|
| 57 |
def main(
|
| 58 |
-
|
| 59 |
dataset: str,
|
| 60 |
max_rows: int,
|
| 61 |
max_workers: int,
|
|
@@ -64,7 +65,7 @@ def main(
|
|
| 64 |
use_gemini: bool = False
|
| 65 |
):
|
| 66 |
models = create_model_dict()
|
| 67 |
-
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
|
| 68 |
start = time.time()
|
| 69 |
|
| 70 |
|
|
@@ -93,9 +94,7 @@ def main(
|
|
| 93 |
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
|
| 94 |
temp_pdf_file.write(pdf_binary)
|
| 95 |
temp_pdf_file.seek(0)
|
| 96 |
-
tqdm.disable = True
|
| 97 |
marker_json = converter(temp_pdf_file.name).children
|
| 98 |
-
tqdm.disable = False
|
| 99 |
|
| 100 |
doc = pdfium.PdfDocument(temp_pdf_file.name)
|
| 101 |
page_image = doc[0].render(scale=92/72).to_pil()
|
|
@@ -223,8 +222,11 @@ def main(
|
|
| 223 |
"gemini": gemini_results
|
| 224 |
}
|
| 225 |
|
| 226 |
-
|
|
|
|
| 227 |
json.dump(results, f, indent=2)
|
| 228 |
|
|
|
|
|
|
|
| 229 |
if __name__ == '__main__':
|
| 230 |
main()
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from itertools import repeat
|
| 6 |
from typing import List
|
| 7 |
+
|
| 8 |
import numpy as np
|
| 9 |
import base64
|
| 10 |
import time
|
|
|
|
| 20 |
import pypdfium2 as pdfium
|
| 21 |
from marker.util import matrix_intersection_area
|
| 22 |
from marker.renderers.json import JSONOutput, JSONBlockOutput
|
| 23 |
+
from marker.settings import settings
|
| 24 |
|
| 25 |
from marker.config.parser import ConfigParser
|
| 26 |
from marker.converters.table import TableConverter
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
@click.command(help="Benchmark Table to HTML Conversion")
|
| 51 |
+
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
|
| 52 |
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
|
| 53 |
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
|
| 54 |
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
|
|
|
|
| 56 |
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
|
| 57 |
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
|
| 58 |
def main(
|
| 59 |
+
result_path: str,
|
| 60 |
dataset: str,
|
| 61 |
max_rows: int,
|
| 62 |
max_workers: int,
|
|
|
|
| 65 |
use_gemini: bool = False
|
| 66 |
):
|
| 67 |
models = create_model_dict()
|
| 68 |
+
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
|
| 69 |
start = time.time()
|
| 70 |
|
| 71 |
|
|
|
|
| 94 |
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
|
| 95 |
temp_pdf_file.write(pdf_binary)
|
| 96 |
temp_pdf_file.seek(0)
|
|
|
|
| 97 |
marker_json = converter(temp_pdf_file.name).children
|
|
|
|
| 98 |
|
| 99 |
doc = pdfium.PdfDocument(temp_pdf_file.name)
|
| 100 |
page_image = doc[0].render(scale=92/72).to_pil()
|
|
|
|
| 222 |
"gemini": gemini_results
|
| 223 |
}
|
| 224 |
|
| 225 |
+
out_path = Path(result_path) / "table.json"
|
| 226 |
+
with open(out_path, "w+") as f:
|
| 227 |
json.dump(results, f, indent=2)
|
| 228 |
|
| 229 |
+
print(f"Results saved to {out_path}.")
|
| 230 |
+
|
| 231 |
if __name__ == '__main__':
|
| 232 |
main()
|
marker/builders/llm_layout.py
CHANGED
|
@@ -50,6 +50,10 @@ class LLMLayoutBuilder(LayoutBuilder):
|
|
| 50 |
int,
|
| 51 |
"The timeout for requests to the Gemini model.",
|
| 52 |
] = 60
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
topk_relabelling_prompt: Annotated[
|
| 54 |
str,
|
| 55 |
"The prompt to use for relabelling blocks.",
|
|
@@ -107,7 +111,7 @@ Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form
|
|
| 107 |
print(f"Error relabelling blocks: {e}")
|
| 108 |
|
| 109 |
def relabel_blocks(self, document: Document):
|
| 110 |
-
pbar = tqdm(desc="LLM layout relabelling")
|
| 111 |
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
|
| 112 |
futures = []
|
| 113 |
for page in document.pages:
|
|
|
|
| 50 |
int,
|
| 51 |
"The timeout for requests to the Gemini model.",
|
| 52 |
] = 60
|
| 53 |
+
disable_tqdm: Annotated[
|
| 54 |
+
bool,
|
| 55 |
+
"Whether to disable the tqdm progress bar.",
|
| 56 |
+
] = False
|
| 57 |
topk_relabelling_prompt: Annotated[
|
| 58 |
str,
|
| 59 |
"The prompt to use for relabelling blocks.",
|
|
|
|
| 111 |
print(f"Error relabelling blocks: {e}")
|
| 112 |
|
| 113 |
def relabel_blocks(self, document: Document):
|
| 114 |
+
pbar = tqdm(desc="LLM layout relabelling", disable=self.disable_tqdm)
|
| 115 |
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
|
| 116 |
futures = []
|
| 117 |
for page in document.pages:
|
marker/processors/equation.py
CHANGED
|
@@ -1,7 +1,4 @@
|
|
| 1 |
from typing import Annotated, List, Optional, Tuple
|
| 2 |
-
|
| 3 |
-
from texify.inference import batch_inference
|
| 4 |
-
from texify.model.model import GenerateVisionEncoderDecoderModel
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
| 7 |
from marker.models import TexifyPredictor
|
|
@@ -32,6 +29,10 @@ class EquationProcessor(BaseProcessor):
|
|
| 32 |
int,
|
| 33 |
"The number of tokens to buffer above max for the Texify model.",
|
| 34 |
] = 256
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def __init__(self, texify_model: TexifyPredictor, config=None):
|
| 37 |
super().__init__(config)
|
|
@@ -80,7 +81,7 @@ class EquationProcessor(BaseProcessor):
|
|
| 80 |
predictions = [""] * len(equation_data)
|
| 81 |
batch_size = self.get_batch_size()
|
| 82 |
|
| 83 |
-
for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations"):
|
| 84 |
# Dynamically set max length to save inference time
|
| 85 |
min_idx = i
|
| 86 |
max_idx = min(min_idx + batch_size, len(equation_data))
|
|
|
|
| 1 |
from typing import Annotated, List, Optional, Tuple
|
|
|
|
|
|
|
|
|
|
| 2 |
from tqdm import tqdm
|
| 3 |
|
| 4 |
from marker.models import TexifyPredictor
|
|
|
|
| 29 |
int,
|
| 30 |
"The number of tokens to buffer above max for the Texify model.",
|
| 31 |
] = 256
|
| 32 |
+
disable_tqdm: Annotated[
|
| 33 |
+
bool,
|
| 34 |
+
"Whether to disable the tqdm progress bar.",
|
| 35 |
+
] = False
|
| 36 |
|
| 37 |
def __init__(self, texify_model: TexifyPredictor, config=None):
|
| 38 |
super().__init__(config)
|
|
|
|
| 81 |
predictions = [""] * len(equation_data)
|
| 82 |
batch_size = self.get_batch_size()
|
| 83 |
|
| 84 |
+
for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations", disable=self.disable_tqdm):
|
| 85 |
# Dynamically set max length to save inference time
|
| 86 |
min_idx = i
|
| 87 |
max_idx = min(min_idx + batch_size, len(equation_data))
|
marker/processors/llm/__init__.py
CHANGED
|
@@ -44,6 +44,10 @@ class BaseLLMProcessor(BaseProcessor):
|
|
| 44 |
bool,
|
| 45 |
"Whether to use the LLM model.",
|
| 46 |
] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
block_types = None
|
| 48 |
|
| 49 |
def __init__(self, config=None):
|
|
@@ -73,7 +77,7 @@ class BaseLLMProcessor(BaseProcessor):
|
|
| 73 |
if total_blocks == 0:
|
| 74 |
return
|
| 75 |
|
| 76 |
-
pbar = tqdm(desc=f"{self.__class__.__name__} running")
|
| 77 |
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
|
| 78 |
for future in as_completed([
|
| 79 |
executor.submit(self.process_rewriting, document, page, block)
|
|
|
|
| 44 |
bool,
|
| 45 |
"Whether to use the LLM model.",
|
| 46 |
] = False
|
| 47 |
+
disable_tqdm: Annotated[
|
| 48 |
+
bool,
|
| 49 |
+
"Whether to disable the tqdm progress bar.",
|
| 50 |
+
] = False
|
| 51 |
block_types = None
|
| 52 |
|
| 53 |
def __init__(self, config=None):
|
|
|
|
| 77 |
if total_blocks == 0:
|
| 78 |
return
|
| 79 |
|
| 80 |
+
pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
|
| 81 |
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
|
| 82 |
for future in as_completed([
|
| 83 |
executor.submit(self.process_rewriting, document, page, block)
|
marker/processors/llm/llm_table_merge.py
CHANGED
|
@@ -44,6 +44,10 @@ class LLMTableMergeProcessor(BaseLLMProcessor):
|
|
| 44 |
int,
|
| 45 |
"The maximum gap between columns to merge tables"
|
| 46 |
] = 50
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
table_merge_prompt: Annotated[
|
| 48 |
str,
|
| 49 |
"The prompt to use for rewriting text.",
|
|
@@ -137,7 +141,7 @@ Table 2
|
|
| 137 |
return max_cols
|
| 138 |
|
| 139 |
def rewrite_blocks(self, document: Document):
|
| 140 |
-
pbar = tqdm(desc=f"{self.__class__.__name__} running")
|
| 141 |
table_runs = []
|
| 142 |
table_run = []
|
| 143 |
prev_block = None
|
|
|
|
| 44 |
int,
|
| 45 |
"The maximum gap between columns to merge tables"
|
| 46 |
] = 50
|
| 47 |
+
disable_tqdm: Annotated[
|
| 48 |
+
bool,
|
| 49 |
+
"Whether to disable the tqdm progress bar.",
|
| 50 |
+
] = False
|
| 51 |
table_merge_prompt: Annotated[
|
| 52 |
str,
|
| 53 |
"The prompt to use for rewriting text.",
|
|
|
|
| 141 |
return max_cols
|
| 142 |
|
| 143 |
def rewrite_blocks(self, document: Document):
|
| 144 |
+
pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
|
| 145 |
table_runs = []
|
| 146 |
table_run = []
|
| 147 |
prev_block = None
|
poetry.lock
CHANGED
|
@@ -2729,6 +2729,18 @@ files = [
|
|
| 2729 |
[package.dependencies]
|
| 2730 |
nvidia-nvjitlink-cu12 = "*"
|
| 2731 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2732 |
[[package]]
|
| 2733 |
name = "nvidia-nccl-cu12"
|
| 2734 |
version = "2.21.5"
|
|
@@ -3566,6 +3578,23 @@ files = [
|
|
| 3566 |
[package.extras]
|
| 3567 |
windows-terminal = ["colorama (>=0.4.6)"]
|
| 3568 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3569 |
[[package]]
|
| 3570 |
name = "pyparsing"
|
| 3571 |
version = "3.2.1"
|
|
@@ -4729,27 +4758,6 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
|
|
| 4729 |
test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"]
|
| 4730 |
typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"]
|
| 4731 |
|
| 4732 |
-
[[package]]
|
| 4733 |
-
name = "texify"
|
| 4734 |
-
version = "0.2.1"
|
| 4735 |
-
description = "OCR for latex images"
|
| 4736 |
-
optional = false
|
| 4737 |
-
python-versions = "<4.0,>=3.10"
|
| 4738 |
-
files = [
|
| 4739 |
-
{file = "texify-0.2.1-py3-none-any.whl", hash = "sha256:861c90ea6167fb6c2b334d5fcf0116dd9e1585af359463dec83115891c09dcfa"},
|
| 4740 |
-
{file = "texify-0.2.1.tar.gz", hash = "sha256:bab30f8445aa60e36de122fb86deb77b3f25348a885d4d5f3c67d6b6f5bb2e81"},
|
| 4741 |
-
]
|
| 4742 |
-
|
| 4743 |
-
[package.dependencies]
|
| 4744 |
-
ftfy = ">=6.1.3,<7.0.0"
|
| 4745 |
-
Pillow = ">=10.1.0,<11.0.0"
|
| 4746 |
-
pydantic = ">=2.5.2,<3.0.0"
|
| 4747 |
-
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4748 |
-
pypdfium2 = ">=4.25.0,<5.0.0"
|
| 4749 |
-
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4750 |
-
torch = ">=2.1.2,<3.0.0"
|
| 4751 |
-
transformers = ">=4.36.2,<5.0.0"
|
| 4752 |
-
|
| 4753 |
[[package]]
|
| 4754 |
name = "threadpoolctl"
|
| 4755 |
version = "3.5.0"
|
|
@@ -4865,28 +4873,31 @@ files = [
|
|
| 4865 |
|
| 4866 |
[[package]]
|
| 4867 |
name = "torch"
|
| 4868 |
-
version = "2.
|
| 4869 |
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
|
| 4870 |
optional = false
|
| 4871 |
-
python-versions = ">=3.
|
| 4872 |
files = [
|
| 4873 |
-
{file = "torch-2.
|
| 4874 |
-
{file = "torch-2.
|
| 4875 |
-
{file = "torch-2.
|
| 4876 |
-
{file = "torch-2.
|
| 4877 |
-
{file = "torch-2.
|
| 4878 |
-
{file = "torch-2.
|
| 4879 |
-
{file = "torch-2.
|
| 4880 |
-
{file = "torch-2.
|
| 4881 |
-
{file = "torch-2.
|
| 4882 |
-
{file = "torch-2.
|
| 4883 |
-
{file = "torch-2.
|
| 4884 |
-
{file = "torch-2.
|
| 4885 |
-
{file = "torch-2.
|
| 4886 |
-
{file = "torch-2.
|
| 4887 |
-
{file = "torch-2.
|
| 4888 |
-
{file = "torch-2.
|
| 4889 |
-
{file = "torch-2.
|
|
|
|
|
|
|
|
|
|
| 4890 |
]
|
| 4891 |
|
| 4892 |
[package.dependencies]
|
|
@@ -4903,17 +4914,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux
|
|
| 4903 |
nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4904 |
nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4905 |
nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
|
|
|
| 4906 |
nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4907 |
nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4908 |
nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4909 |
setuptools = {version = "*", markers = "python_version >= \"3.12\""}
|
| 4910 |
sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
|
| 4911 |
-
triton = {version = "3.
|
| 4912 |
-
typing-extensions = ">=4.
|
| 4913 |
|
| 4914 |
[package.extras]
|
| 4915 |
opt-einsum = ["opt-einsum (>=3.3)"]
|
| 4916 |
-
optree = ["optree (>=0.
|
| 4917 |
|
| 4918 |
[[package]]
|
| 4919 |
name = "tornado"
|
|
@@ -5042,21 +5054,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
|
|
| 5042 |
|
| 5043 |
[[package]]
|
| 5044 |
name = "triton"
|
| 5045 |
-
version = "3.
|
| 5046 |
description = "A language and compiler for custom Deep Learning operations"
|
| 5047 |
optional = false
|
| 5048 |
python-versions = "*"
|
| 5049 |
files = [
|
| 5050 |
-
{file = "triton-3.
|
| 5051 |
-
{file = "triton-3.
|
| 5052 |
-
{file = "triton-3.
|
| 5053 |
-
{file = "triton-3.
|
| 5054 |
-
{file = "triton-3.
|
| 5055 |
]
|
| 5056 |
|
| 5057 |
-
[package.dependencies]
|
| 5058 |
-
filelock = "*"
|
| 5059 |
-
|
| 5060 |
[package.extras]
|
| 5061 |
build = ["cmake (>=3.20)", "lit"]
|
| 5062 |
tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
|
|
@@ -5489,4 +5498,4 @@ propcache = ">=0.2.0"
|
|
| 5489 |
[metadata]
|
| 5490 |
lock-version = "2.0"
|
| 5491 |
python-versions = "^3.10"
|
| 5492 |
-
content-hash = "
|
|
|
|
| 2729 |
[package.dependencies]
|
| 2730 |
nvidia-nvjitlink-cu12 = "*"
|
| 2731 |
|
| 2732 |
+
[[package]]
|
| 2733 |
+
name = "nvidia-cusparselt-cu12"
|
| 2734 |
+
version = "0.6.2"
|
| 2735 |
+
description = "NVIDIA cuSPARSELt"
|
| 2736 |
+
optional = false
|
| 2737 |
+
python-versions = "*"
|
| 2738 |
+
files = [
|
| 2739 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"},
|
| 2740 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"},
|
| 2741 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"},
|
| 2742 |
+
]
|
| 2743 |
+
|
| 2744 |
[[package]]
|
| 2745 |
name = "nvidia-nccl-cu12"
|
| 2746 |
version = "2.21.5"
|
|
|
|
| 3578 |
[package.extras]
|
| 3579 |
windows-terminal = ["colorama (>=0.4.6)"]
|
| 3580 |
|
| 3581 |
+
[[package]]
|
| 3582 |
+
name = "pymupdf"
|
| 3583 |
+
version = "1.25.2"
|
| 3584 |
+
description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
|
| 3585 |
+
optional = false
|
| 3586 |
+
python-versions = ">=3.9"
|
| 3587 |
+
files = [
|
| 3588 |
+
{file = "pymupdf-1.25.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:59dea22b633cc4fc13670b4c5db50d71f8cd4f420814420f33ce47ddcb61e1f6"},
|
| 3589 |
+
{file = "pymupdf-1.25.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:e8b8a874497cd0deee89a6a4fb76a3a08173c8d39e88fc7cf715764ec5a243e9"},
|
| 3590 |
+
{file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f61e5cdb25b86eb28d34aa3557b49ecf9e361d5f5cd3b1660406f8f0bf813af7"},
|
| 3591 |
+
{file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8cfa7a97d78f813d286ecba32369059d88073edd1e5cf105f4cd0811f71925"},
|
| 3592 |
+
{file = "pymupdf-1.25.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:295505fe1ecb7c7b57d4124d373e207ea311d8e40bc7ac3016d8ec2d60b091e9"},
|
| 3593 |
+
{file = "pymupdf-1.25.2-cp39-abi3-win32.whl", hash = "sha256:b9488c8b82bb9be36fb13ee0c8d43b0ddcc50af83b61da01e6040413d9e67da6"},
|
| 3594 |
+
{file = "pymupdf-1.25.2-cp39-abi3-win_amd64.whl", hash = "sha256:1b4ca6f5780d319a08dff885a5a0e3585c5d7af04dcfa063c535b88371fd91c1"},
|
| 3595 |
+
{file = "pymupdf-1.25.2.tar.gz", hash = "sha256:9ea88ff1b3ccb359620f106a6fd5ba6877d959d21d78272052c3496ceede6eec"},
|
| 3596 |
+
]
|
| 3597 |
+
|
| 3598 |
[[package]]
|
| 3599 |
name = "pyparsing"
|
| 3600 |
version = "3.2.1"
|
|
|
|
| 4758 |
test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"]
|
| 4759 |
typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"]
|
| 4760 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4761 |
[[package]]
|
| 4762 |
name = "threadpoolctl"
|
| 4763 |
version = "3.5.0"
|
|
|
|
| 4873 |
|
| 4874 |
[[package]]
|
| 4875 |
name = "torch"
|
| 4876 |
+
version = "2.6.0"
|
| 4877 |
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
|
| 4878 |
optional = false
|
| 4879 |
+
python-versions = ">=3.9.0"
|
| 4880 |
files = [
|
| 4881 |
+
{file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"},
|
| 4882 |
+
{file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"},
|
| 4883 |
+
{file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"},
|
| 4884 |
+
{file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"},
|
| 4885 |
+
{file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"},
|
| 4886 |
+
{file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"},
|
| 4887 |
+
{file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"},
|
| 4888 |
+
{file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"},
|
| 4889 |
+
{file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"},
|
| 4890 |
+
{file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"},
|
| 4891 |
+
{file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"},
|
| 4892 |
+
{file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"},
|
| 4893 |
+
{file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"},
|
| 4894 |
+
{file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"},
|
| 4895 |
+
{file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"},
|
| 4896 |
+
{file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"},
|
| 4897 |
+
{file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"},
|
| 4898 |
+
{file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"},
|
| 4899 |
+
{file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"},
|
| 4900 |
+
{file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"},
|
| 4901 |
]
|
| 4902 |
|
| 4903 |
[package.dependencies]
|
|
|
|
| 4914 |
nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4915 |
nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4916 |
nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4917 |
+
nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4918 |
nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4919 |
nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4920 |
nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4921 |
setuptools = {version = "*", markers = "python_version >= \"3.12\""}
|
| 4922 |
sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
|
| 4923 |
+
triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4924 |
+
typing-extensions = ">=4.10.0"
|
| 4925 |
|
| 4926 |
[package.extras]
|
| 4927 |
opt-einsum = ["opt-einsum (>=3.3)"]
|
| 4928 |
+
optree = ["optree (>=0.13.0)"]
|
| 4929 |
|
| 4930 |
[[package]]
|
| 4931 |
name = "tornado"
|
|
|
|
| 5054 |
|
| 5055 |
[[package]]
|
| 5056 |
name = "triton"
|
| 5057 |
+
version = "3.2.0"
|
| 5058 |
description = "A language and compiler for custom Deep Learning operations"
|
| 5059 |
optional = false
|
| 5060 |
python-versions = "*"
|
| 5061 |
files = [
|
| 5062 |
+
{file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"},
|
| 5063 |
+
{file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"},
|
| 5064 |
+
{file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"},
|
| 5065 |
+
{file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"},
|
| 5066 |
+
{file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"},
|
| 5067 |
]
|
| 5068 |
|
|
|
|
|
|
|
|
|
|
| 5069 |
[package.extras]
|
| 5070 |
build = ["cmake (>=3.20)", "lit"]
|
| 5071 |
tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
|
|
|
|
| 5498 |
[metadata]
|
| 5499 |
lock-version = "2.0"
|
| 5500 |
python-versions = "^3.10"
|
| 5501 |
+
content-hash = "9d330f12a8bad0352ec550e1d6a77348b10f6bca7ecc41769813bec85d3f9e08"
|
pyproject.toml
CHANGED
|
@@ -25,7 +25,6 @@ python-dotenv = "^1.0.0"
|
|
| 25 |
torch = "^2.5.1"
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
-
texify = "^0.2.1"
|
| 29 |
rapidfuzz = "^3.8.1"
|
| 30 |
surya-ocr = "~0.10.0"
|
| 31 |
regex = "^2024.4.28"
|
|
@@ -50,6 +49,7 @@ apted = "1.0.3"
|
|
| 50 |
distance = "0.1.3"
|
| 51 |
lxml = "5.3.0"
|
| 52 |
tabulate = "^0.9.0"
|
|
|
|
| 53 |
|
| 54 |
[tool.poetry.scripts]
|
| 55 |
marker = "marker.scripts.convert:convert_cli"
|
|
|
|
| 25 |
torch = "^2.5.1"
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
|
|
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
surya-ocr = "~0.10.0"
|
| 30 |
regex = "^2024.4.28"
|
|
|
|
| 49 |
distance = "0.1.3"
|
| 50 |
lxml = "5.3.0"
|
| 51 |
tabulate = "^0.9.0"
|
| 52 |
+
pymupdf = "^1.25.2"
|
| 53 |
|
| 54 |
[tool.poetry.scripts]
|
| 55 |
marker = "marker.scripts.convert:convert_cli"
|