Vik Paruchuri
committed
Commit · 5a6b7c2
1 Parent(s): a19295b
Finalize dataset uploading
benchmarks/overall/display/__init__.py
CHANGED
@@ -1,62 +0,0 @@
-from pathlib import Path
-from typing import Dict, List
-
-import tabulate
-
-from benchmarks.overall.schema import FullResult
-
-def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
-    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
-    with open(out_path / filename, "w", encoding="utf-8") as f:
-        f.write(f"# {title}\n")
-        f.write(table)
-    print(title)
-    print(table)
-
-
-def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
-    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
-    headers = ["Document Type"]
-    for method in methods:
-        for score_type in score_types:
-            headers.append(f"{method} {score_type}")
-
-    document_rows = [[k] for k in document_types]
-    for i, doc_type in enumerate(document_types):
-        for method in methods:
-            for score_type in score_types:
-                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
-                document_rows[i].append(avg_score)
-
-    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
-
-    headers = ["Block Type"]
-    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys())  # all possible blocks
-    block_score_types = list(result["averages_by_block_type"][default_method].keys())
-    for method in methods:
-        for score_type in block_score_types:
-            headers.append(f"{method} {score_type}")
-
-    block_rows = [[k] for k in block_types]
-    for i, block_type in enumerate(block_types):
-        for method in methods:
-            for score_type in block_score_types:
-                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
-                block_rows[i].append(avg_score)
-
-    write_table("Block types", block_rows, headers, out_path, "block_types.md")
-
-    headers = ["Method", "Avg Time"] + score_types
-    inference_rows = [[k] for k in methods]
-    all_raw_scores = [result["scores"][i] for i in result["scores"]]
-    for i, method in enumerate(methods):
-        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
-        inference_rows[i].append(avg_time)
-        for score_type in score_types:
-            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
-            avg_score = sum(scores_lst) / max(1, len(scores_lst))
-            inference_rows[i].append(avg_score)
-
-    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
-
-    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
benchmarks/overall/display/dataset.py
ADDED
@@ -0,0 +1,39 @@
+import json
+from typing import List
+
+import datasets
+from tqdm import tqdm
+
+from benchmarks.overall.registry import METHOD_REGISTRY
+from benchmarks.overall.schema import FullResult
+
+
+def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str]) -> datasets.Dataset:
+    rows = []
+    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
+        if idx not in result["markdown"]:
+            continue
+
+        row = {
+            "uuid": sample["uuid"],
+            "classification": sample["classification"],
+            "language": sample["language"],
+            "img": sample["img"],
+        }
+        for method in result["markdown"][idx]:
+            if method == "gt":
+                continue
+
+            method_cls = METHOD_REGISTRY[method]()
+            md = result["markdown"][idx][method]
+            method_img = method_cls.render(result["markdown"][idx][method])
+            row[f"{method}_md"] = md
+            row[f"{method}_img"] = method_img
+
+            for score_type in score_types:
+                row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
+                row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
+        rows.append(row)
+    ds = datasets.Dataset.from_list(rows)
+    return ds
+
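For reference, each row emitted by build_dataset follows roughly the shape sketched below. The field names come from the code above; the "marker" method name, the "heuristic" score type, and all values are illustrative placeholders, not data from the commit.

# Hypothetical example of one row produced by build_dataset; only the key
# layout is taken from the code above, the values are made up.
example_row = {
    "uuid": "0001",                     # copied from the benchmark sample
    "classification": "scientific_paper",
    "language": "en",
    "img": None,                        # the sample page image in the real dataset
    "marker_md": "# Title\n\nBody",     # markdown produced by the "marker" method
    "marker_img": None,                 # rendered image of that markdown
    "marker_heuristic": 95.2,           # per-sample score for the "heuristic" score type
    "marker_heuristic_detail": "{}",    # JSON-encoded specific_scores
}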
benchmarks/overall/display/table.py
ADDED
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Dict, List
+
+import tabulate
+
+from benchmarks.overall.schema import FullResult
+
+def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
+    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
+    with open(out_path / filename, "w", encoding="utf-8") as f:
+        f.write(f"# {title}\n")
+        f.write(table)
+    print(title)
+    print(table)
+
+
+def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
+    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
+    headers = ["Document Type"]
+    for method in methods:
+        for score_type in score_types:
+            headers.append(f"{method} {score_type}")
+
+    document_rows = [[k] for k in document_types]
+    for i, doc_type in enumerate(document_types):
+        for method in methods:
+            for score_type in score_types:
+                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
+                document_rows[i].append(avg_score)
+
+    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
+
+    headers = ["Block Type"]
+    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys())  # all possible blocks
+    block_score_types = list(result["averages_by_block_type"][default_method].keys())
+    for method in methods:
+        for score_type in block_score_types:
+            headers.append(f"{method} {score_type}")
+
+    block_rows = [[k] for k in block_types]
+    for i, block_type in enumerate(block_types):
+        for method in methods:
+            for score_type in block_score_types:
+                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
+                block_rows[i].append(avg_score)
+
+    write_table("Block types", block_rows, headers, out_path, "block_types.md")
+
+    headers = ["Method", "Avg Time"] + score_types
+    inference_rows = [[k] for k in methods]
+    all_raw_scores = [result["scores"][i] for i in result["scores"]]
+    for i, method in enumerate(methods):
+        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
+        inference_rows[i].append(avg_time)
+        for score_type in score_types:
+            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
+            avg_score = sum(scores_lst) / max(1, len(scores_lst))
+            inference_rows[i].append(avg_score)
+
+    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
+
+    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
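As a quick illustration of what write_table writes to files such as overall.md, here is a minimal sketch using the same tabulate call; the method name, timing, and score are invented for the example.

import tabulate

# Minimal sketch of the write_table output format (tablefmt="github");
# "marker", the timing, and the score are placeholder values.
rows = [["marker", 2.3, 95.1]]
headers = ["Method", "Avg Time", "heuristic"]
print(tabulate.tabulate(rows, headers=headers, tablefmt="github"))
# Prints a GitHub-flavored markdown table, roughly:
# | Method   |   Avg Time |   heuristic |
# |----------|------------|-------------|
# | marker   |        2.3 |        95.1 |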
benchmarks/overall/overall.py
CHANGED
@@ -8,12 +8,13 @@ import click
 import datasets
 from tqdm import tqdm
 
+from benchmarks.overall.display.dataset import build_dataset
 from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.settings import settings
-from benchmarks.overall.display import print_scores
+from benchmarks.overall.display.table import print_scores
 
 configure_logging()
 
@@ -32,6 +33,7 @@ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], s
         gt_cls = METHOD_REGISTRY["gt"]
         gt_blocks = json.loads(sample["gt_blocks"])
         gt_md = gt_cls(**artifacts)(sample)["markdown"]
+        markdown_by_method[idx]["gt"] = gt_md
 
         out_data = defaultdict(dict)
 
@@ -115,9 +117,14 @@ def main(
     # Display benchmark scoring tables
     print_scores(result, out_path, methods, score_types)
 
+    # Write to json
    with open(out_path / "result.json", "w") as f:
         json.dump(result, f)
 
+    if out_dataset:
+        dataset = build_dataset(benchmark_dataset, result, score_types)
+        dataset.push_to_hub(out_dataset)
+
 
 if __name__ == "__main__":
     main()
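The new out_dataset branch is a standard datasets upload; a minimal sketch of that final step in isolation is below, where the row contents and the Hub repo id are placeholders rather than values from the commit. push_to_hub requires an authenticated Hugging Face session (e.g. via huggingface-cli login).

import datasets

# Minimal sketch of the upload step added to main(); the row and the repo id
# ("user/marker-benchmark-results") are placeholders.
ds = datasets.Dataset.from_list([{"uuid": "0001", "marker_heuristic": 95.2}])
ds.push_to_hub("user/marker-benchmark-results")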
benchmarks/verify_scores.py
CHANGED
@@ -6,7 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
-
+    raw_scores = [data["scores"][k] for k in data["scores"]]
+    marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
+    marker_score = sum(marker_scores) / len(marker_scores)
     if marker_score < 90:
         raise ValueError("Marker score below 90")
 
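The updated check assumes result.json nests per-sample scores by method and score type; a sketch of the expected structure, inferred from the indexing above, is below. Only the nesting is implied by the code; the sample key and numbers are made up.

# Hypothetical result.json layout that the new verify_scores reads:
# "scores" -> sample key -> "marker" -> "heuristic" -> "score".
data = {
    "scores": {
        "0": {
            "marker": {
                "heuristic": {"score": 96.1, "specific_scores": {}},
            },
        },
    },
}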