Vik Paruchuri committed
Commit 5a6b7c2 · 1 Parent(s): a19295b

Finalize dataset uploading

benchmarks/overall/display/__init__.py CHANGED
@@ -1,62 +0,0 @@
-from pathlib import Path
-from typing import Dict, List
-
-import tabulate
-
-from benchmarks.overall.schema import FullResult
-
-def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
-    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
-    with open(out_path / filename, "w", encoding="utf-8") as f:
-        f.write(f"# {title}\n")
-        f.write(table)
-    print(title)
-    print(table)
-
-
-def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
-    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
-    headers = ["Document Type"]
-    for method in methods:
-        for score_type in score_types:
-            headers.append(f"{method} {score_type}")
-
-    document_rows = [[k] for k in document_types]
-    for i, doc_type in enumerate(document_types):
-        for method in methods:
-            for score_type in score_types:
-                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
-                document_rows[i].append(avg_score)
-
-    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
-
-    headers = ["Block Type"]
-    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
-    block_score_types = list(result["averages_by_block_type"][default_method].keys())
-    for method in methods:
-        for score_type in block_score_types:
-            headers.append(f"{method} {score_type}")
-
-    block_rows = [[k] for k in block_types]
-    for i, block_type in enumerate(block_types):
-        for method in methods:
-            for score_type in block_score_types:
-                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
-                block_rows[i].append(avg_score)
-
-    write_table("Block types", block_rows, headers, out_path, "block_types.md")
-
-    headers = ["Method", "Avg Time"] + score_types
-    inference_rows = [[k] for k in methods]
-    all_raw_scores = [result["scores"][i] for i in result["scores"]]
-    for i, method in enumerate(methods):
-        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
-        inference_rows[i].append(avg_time)
-        for score_type in score_types:
-            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
-            avg_score = sum(scores_lst) / max(1, len(scores_lst))
-            inference_rows[i].append(avg_score)
-
-    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
-
-    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
benchmarks/overall/display/dataset.py ADDED
@@ -0,0 +1,39 @@
+import json
+from typing import List
+
+import datasets
+from tqdm import tqdm
+
+from benchmarks.overall.registry import METHOD_REGISTRY
+from benchmarks.overall.schema import FullResult
+
+
+def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str]) -> datasets.Dataset:
+    rows = []
+    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
+        if idx not in result["markdown"]:
+            continue
+
+        row = {
+            "uuid": sample["uuid"],
+            "classification": sample["classification"],
+            "language": sample["language"],
+            "img": sample["img"],
+        }
+        for method in result["markdown"][idx]:
+            if method == "gt":
+                continue
+
+            method_cls = METHOD_REGISTRY[method]()
+            md = result["markdown"][idx][method]
+            method_img = method_cls.render(result["markdown"][idx][method])
+            row[f"{method}_md"] = md
+            row[f"{method}_img"] = method_img
+
+            for score_type in score_types:
+                row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
+                row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
+        rows.append(row)
+    ds = datasets.Dataset.from_list(rows)
+    return ds
+
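For reference, the flattened rows that build_dataset produces can be constructed directly with datasets.Dataset.from_list. Below is a minimal sketch with made-up values for one method ("marker") and one score type ("heuristic"); the image columns (img, marker_img) are omitted, the uuid/classification values are hypothetical, and the hub repo id in the comment is a placeholder.

import json

import datasets

# Dummy rows mirroring build_dataset's per-sample columns (values are made up).
rows = [
    {
        "uuid": "sample-0001",                       # hypothetical sample id
        "classification": "scientific_paper",        # hypothetical document type
        "language": "en",
        "marker_md": "# Title\n\nSome body text.",   # predicted markdown for this method
        "marker_heuristic": 92.5,                    # per-sample score for this score type
        "marker_heuristic_detail": json.dumps({"Text": 95.0, "Table": 88.0}),
    }
]
ds = datasets.Dataset.from_list(rows)
print(ds.column_names)
# ds.push_to_hub("username/benchmark-results")  # same call overall.py makes when out_dataset is set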
benchmarks/overall/display/table.py ADDED
@@ -0,0 +1,62 @@
+from pathlib import Path
+from typing import Dict, List
+
+import tabulate
+
+from benchmarks.overall.schema import FullResult
+
+def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
+    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
+    with open(out_path / filename, "w", encoding="utf-8") as f:
+        f.write(f"# {title}\n")
+        f.write(table)
+    print(title)
+    print(table)
+
+
+def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
+    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
+    headers = ["Document Type"]
+    for method in methods:
+        for score_type in score_types:
+            headers.append(f"{method} {score_type}")
+
+    document_rows = [[k] for k in document_types]
+    for i, doc_type in enumerate(document_types):
+        for method in methods:
+            for score_type in score_types:
+                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
+                document_rows[i].append(avg_score)
+
+    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
+
+    headers = ["Block Type"]
+    block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
+    block_score_types = list(result["averages_by_block_type"][default_method].keys())
+    for method in methods:
+        for score_type in block_score_types:
+            headers.append(f"{method} {score_type}")
+
+    block_rows = [[k] for k in block_types]
+    for i, block_type in enumerate(block_types):
+        for method in methods:
+            for score_type in block_score_types:
+                avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
+                block_rows[i].append(avg_score)
+
+    write_table("Block types", block_rows, headers, out_path, "block_types.md")
+
+    headers = ["Method", "Avg Time"] + score_types
+    inference_rows = [[k] for k in methods]
+    all_raw_scores = [result["scores"][i] for i in result["scores"]]
+    for i, method in enumerate(methods):
+        avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
+        inference_rows[i].append(avg_time)
+        for score_type in score_types:
+            scores_lst = [ar[method][score_type]["score"] for ar in all_raw_scores]
+            avg_score = sum(scores_lst) / max(1, len(scores_lst))
+            inference_rows[i].append(avg_score)
+
+    write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
+
+    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
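As a quick usage sketch of the table helper above: the call below writes a GitHub-flavored markdown table with made-up methods, times, and scores (the real rows are built by print_scores from the benchmark averages in FullResult), and assumes it is run from the repo root so the benchmarks package is importable.

from pathlib import Path

from benchmarks.overall.display.table import write_table

# Made-up rows: one per method, with a dummy average time and dummy scores.
headers = ["Method", "Avg Time", "heuristic"]
rows = [
    ["marker", 3.2, 94.1],
    ["other_method", 5.8, 90.3],
]
out_dir = Path("benchmark_output")
out_dir.mkdir(exist_ok=True)
# Writes benchmark_output/overall.md as a GitHub-flavored table and also prints it.
write_table("Overall Results", rows, headers, out_dir, "overall.md")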
benchmarks/overall/overall.py CHANGED
@@ -8,12 +8,13 @@ import click
 import datasets
 from tqdm import tqdm
 
+from benchmarks.overall.display.dataset import build_dataset
 from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.settings import settings
-from benchmarks.overall.display import print_scores
+from benchmarks.overall.display.table import print_scores
 
 configure_logging()
 
@@ -32,6 +33,7 @@ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], s
         gt_cls = METHOD_REGISTRY["gt"]
         gt_blocks = json.loads(sample["gt_blocks"])
         gt_md = gt_cls(**artifacts)(sample)["markdown"]
+        markdown_by_method[idx]["gt"] = gt_md
 
         out_data = defaultdict(dict)
 
@@ -115,9 +117,14 @@ def main(
     # Display benchmark scoring tables
     print_scores(result, out_path, methods, score_types)
 
+    # Write to json
     with open(out_path / "result.json", "w") as f:
         json.dump(result, f)
 
+    if out_dataset:
+        dataset = build_dataset(benchmark_dataset, result, score_types)
+        dataset.push_to_hub(out_dataset)
+
 
 if __name__ == "__main__":
     main()
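One note on how this hunk interacts with build_dataset: storing the ground-truth markdown under the "gt" key means result["markdown"][idx] now holds both predictions and the reference text, and build_dataset filters the "gt" entry back out so only predictions become dataset columns. A tiny sketch with dummy markdown strings:

# Sketch only: dummy stand-in for result["markdown"][idx] after this commit.
markdown_for_sample = {
    "gt": "# Ground truth\n\nReference markdown.",  # stored by get_method_scores
    "marker": "# Marker\n\nPredicted markdown.",    # one entry per benchmarked method
}

# build_dataset skips the "gt" key, so only prediction methods become columns.
prediction_methods = [m for m in markdown_for_sample if m != "gt"]
print(prediction_methods)  # ['marker']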
benchmarks/verify_scores.py CHANGED
@@ -6,7 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
-    marker_score = data["marker"]["average_score"]
+    raw_scores = [data["scores"][k] for k in data["scores"]]
+    marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
+    marker_score = sum(marker_scores) / len(marker_scores)
     if marker_score < 90:
         raise ValueError("Marker score below 90")
 
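To see what the updated check expects, here is a minimal sketch that writes a result.json in the new nested layout and runs verify_scores against it. It assumes benchmarks/verify_scores.py is importable from the repo root; the sample indices and score values are dummies.

import json
from pathlib import Path

from benchmarks.verify_scores import verify_scores

# Dummy result.json: scores -> sample index -> method -> score type -> {"score": ...}
result = {
    "scores": {
        "0": {"marker": {"heuristic": {"score": 95.0}}},
        "1": {"marker": {"heuristic": {"score": 92.0}}},
    }
}
Path("result.json").write_text(json.dumps(result))

verify_scores("result.json")  # raises ValueError if the mean heuristic score is below 90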