Vik Paruchuri
committed
Commit · 5a6b7c2
1 Parent(s): a19295b
Finalize dataset uploading
benchmarks/overall/display/__init__.py
CHANGED
@@ -1,62 +0,0 @@
-from pathlib import Path
-from typing import Dict, List
-
-import tabulate
-
-from benchmarks.overall.schema import FullResult
-
-def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
-    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
-    with open(out_path / filename, "w", encoding="utf-8") as f:
-        f.write(f"# {title}\n")
-        f.write(table)
-    print(title)
-    print(table)
-
-
-def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
-    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
-    headers = ["Document Type"]
-    for method in methods:
-        for score_type in score_types:
-            headers.append(f"{method} {score_type}")
-
-    document_rows = [[k] for k in document_types]
-    for i, doc_type in enumerate(document_types):
-        for method in methods:
-            for score_type in score_types:
-                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
-                document_rows[i].append(avg_score)
-
-    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
-
-    headers = ["Block Type"]
-    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys())  # all possible blocks
-    block_score_types = list(result["averages_by_block_type"][default_method].keys())
-    for method in methods:
-        for score_type in block_score_types:
-            headers.append(f"{method} {score_type}")
-
-    block_rows = [[k] for k in block_types]
-    for i, block_type in enumerate(block_types):
-        for method in methods:
-            for score_type in block_score_types:
-                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
-                block_rows[i].append(avg_score)
-
-    write_table("Block types", block_rows, headers, out_path, "block_types.md")
-
-    headers = ["Method", "Avg Time"] + score_types
-    inference_rows = [[k] for k in methods]
-    all_raw_scores = [result["scores"][i] for i in result["scores"]]
-    for i, method in enumerate(methods):
-        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
-        inference_rows[i].append(avg_time)
-        for score_type in score_types:
-            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
-            avg_score = sum(scores_lst) / max(1, len(scores_lst))
-            inference_rows[i].append(avg_score)
-
-    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
-
-    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
benchmarks/overall/display/dataset.py
ADDED
@@ -0,0 +1,39 @@
+import json
+from typing import List
+
+import datasets
+from tqdm import tqdm
+
+from benchmarks.overall.registry import METHOD_REGISTRY
+from benchmarks.overall.schema import FullResult
+
+
+def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str]) -> datasets.Dataset:
+    rows = []
+    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
+        if idx not in result["markdown"]:
+            continue
+
+        row = {
+            "uuid": sample["uuid"],
+            "classification": sample["classification"],
+            "language": sample["language"],
+            "img": sample["img"],
+        }
+        for method in result["markdown"][idx]:
+            if method == "gt":
+                continue
+
+            method_cls = METHOD_REGISTRY[method]()
+            md = result["markdown"][idx][method]
+            method_img = method_cls.render(result["markdown"][idx][method])
+            row[f"{method}_md"] = md
+            row[f"{method}_img"] = method_img
+
+            for score_type in score_types:
+                row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
+                row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
+        rows.append(row)
+    ds = datasets.Dataset.from_list(rows)
+    return ds
+
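For reference, each row emitted by build_dataset follows roughly the shape sketched below. The field names come from the code above; the "marker" method name, the "heuristic" score type, and all values are illustrative placeholders, not data from the commit.

# Hypothetical example of one row produced by build_dataset; only the key
# layout is taken from the code above, the values are made up.
example_row = {
    "uuid": "0001",                     # copied from the benchmark sample
    "classification": "scientific_paper",
    "language": "en",
    "img": None,                        # the sample page image in the real dataset
    "marker_md": "# Title\n\nBody",     # markdown produced by the "marker" method
    "marker_img": None,                 # rendered image of that markdown
    "marker_heuristic": 95.2,           # per-sample score for the "heuristic" score type
    "marker_heuristic_detail": "{}",    # JSON-encoded specific_scores
}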
benchmarks/overall/display/table.py
ADDED
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Dict, List
+
+import tabulate
+
+from benchmarks.overall.schema import FullResult
+
+def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
+    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
+    with open(out_path / filename, "w", encoding="utf-8") as f:
+        f.write(f"# {title}\n")
+        f.write(table)
+    print(title)
+    print(table)
+
+
+def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
+    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
+    headers = ["Document Type"]
+    for method in methods:
+        for score_type in score_types:
+            headers.append(f"{method} {score_type}")
+
+    document_rows = [[k] for k in document_types]
+    for i, doc_type in enumerate(document_types):
+        for method in methods:
+            for score_type in score_types:
+                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
+                document_rows[i].append(avg_score)
+
+    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
+
+    headers = ["Block Type"]
+    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys())  # all possible blocks
+    block_score_types = list(result["averages_by_block_type"][default_method].keys())
+    for method in methods:
+        for score_type in block_score_types:
+            headers.append(f"{method} {score_type}")
+
+    block_rows = [[k] for k in block_types]
+    for i, block_type in enumerate(block_types):
+        for method in methods:
+            for score_type in block_score_types:
+                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
+                block_rows[i].append(avg_score)
+
+    write_table("Block types", block_rows, headers, out_path, "block_types.md")
+
+    headers = ["Method", "Avg Time"] + score_types
+    inference_rows = [[k] for k in methods]
+    all_raw_scores = [result["scores"][i] for i in result["scores"]]
+    for i, method in enumerate(methods):
+        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
+        inference_rows[i].append(avg_time)
+        for score_type in score_types:
+            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
+            avg_score = sum(scores_lst) / max(1, len(scores_lst))
+            inference_rows[i].append(avg_score)
+
+    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
+
+    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
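As a quick illustration of what write_table writes to files such as overall.md, here is a minimal sketch using the same tabulate call; the method name, timing, and score are invented for the example.

import tabulate

# Minimal sketch of the write_table output format (tablefmt="github");
# "marker", the timing, and the score are placeholder values.
rows = [["marker", 2.3, 95.1]]
headers = ["Method", "Avg Time", "heuristic"]
print(tabulate.tabulate(rows, headers=headers, tablefmt="github"))
# Prints a GitHub-flavored markdown table, roughly:
# | Method   |   Avg Time |   heuristic |
# |----------|------------|-------------|
# | marker   |        2.3 |        95.1 |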
benchmarks/overall/overall.py
CHANGED
@@ -8,12 +8,13 @@ import click
 import datasets
 from tqdm import tqdm
 
+from benchmarks.overall.display.dataset import build_dataset
 from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.settings import settings
-from benchmarks.overall.display import print_scores
+from benchmarks.overall.display.table import print_scores
 
 configure_logging()
 
@@ -32,6 +33,7 @@ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], s
         gt_cls = METHOD_REGISTRY["gt"]
         gt_blocks = json.loads(sample["gt_blocks"])
         gt_md = gt_cls(**artifacts)(sample)["markdown"]
+        markdown_by_method[idx]["gt"] = gt_md
 
         out_data = defaultdict(dict)
 
@@ -115,9 +117,14 @@ def main(
     # Display benchmark scoring tables
     print_scores(result, out_path, methods, score_types)
 
+    # Write to json
    with open(out_path / "result.json", "w") as f:
         json.dump(result, f)
 
+    if out_dataset:
+        dataset = build_dataset(benchmark_dataset, result, score_types)
+        dataset.push_to_hub(out_dataset)
+
 
 if __name__ == "__main__":
     main()
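The new out_dataset branch is a standard datasets upload; a minimal sketch of that final step in isolation is below, where the row contents and the Hub repo id are placeholders rather than values from the commit. push_to_hub requires an authenticated Hugging Face session (e.g. via huggingface-cli login).

import datasets

# Minimal sketch of the upload step added to main(); the row and the repo id
# ("user/marker-benchmark-results") are placeholders.
ds = datasets.Dataset.from_list([{"uuid": "0001", "marker_heuristic": 95.2}])
ds.push_to_hub("user/marker-benchmark-results")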
benchmarks/verify_scores.py
CHANGED
@@ -6,7 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
-
+    raw_scores = [data["scores"][k] for k in data["scores"]]
+    marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
+    marker_score = sum(marker_scores) / len(marker_scores)
     if marker_score < 90:
         raise ValueError("Marker score below 90")
 
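The updated check assumes result.json nests per-sample scores by method and score type; a sketch of the expected structure, inferred from the indexing above, is below. Only the nesting is implied by the code; the sample key and numbers are made up.

# Hypothetical result.json layout that the new verify_scores reads:
# "scores" -> sample key -> "marker" -> "heuristic" -> "score".
data = {
    "scores": {
        "0": {
            "marker": {
                "heuristic": {"score": 96.1, "specific_scores": {}},
            },
        },
    },
}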