# File: zeroshotGPU/tests/test_structure_metrics.py
# Commit author: Arjunvir Singh
# Commit message: Initial commit: zeroshotGPU MVP with full eval surface
# Commit: db06ffa
"""Tests for table structure similarity, formula CER, and OmniDocBench adapters."""
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from zsgdp.benchmarks.datasets import DatasetDocument, register_dataset_loader
from zsgdp.benchmarks.datasets import _LOADERS as _DATASET_LOADERS
from zsgdp.benchmarks.ground_truth import (
omnidocbench_formula_truths,
omnidocbench_table_truths,
parsed_formula_records,
parsed_table_records,
)
from zsgdp.benchmarks.parser_quality import run_parser_benchmark
from zsgdp.schema import Element, FigureObject, ParsedDocument, QualityReport, TableObject
from zsgdp.verify.formula_extraction import compute_formula_extraction
from zsgdp.verify.table_structure import compute_table_structure_score, html_to_rows, markdown_to_rows
class TestMarkdownAndHTMLRows(unittest.TestCase):
    """Row extraction from markdown and HTML table sources.

    Both expectations use lower-cased cell text, so the tests pin that the
    helpers return rows as lists of lower-cased strings for these inputs.
    """

    def test_markdown_strips_separator_row(self):
        """The ``| --- |`` divider line must not show up as a data row."""
        source = "| A | B |\n| --- | --- |\n| 1 | 2 |\n"
        expected_rows = [["a", "b"], ["1", "2"]]
        self.assertEqual(markdown_to_rows(source), expected_rows)

    def test_html_handles_th_and_td(self):
        """Header (``th``) and data (``td``) cells are both collected."""
        markup = "<table><tr><th>Col</th></tr><tr><td>val</td></tr></table>"
        extracted = html_to_rows(markup)
        self.assertEqual(extracted, [["col"], ["val"]])
class TestComputeTableStructure(unittest.TestCase):
    """Scoring of predicted tables against ground-truth tables."""

    # Markdown fixtures shared by the tests below.
    _AB_1_2 = "| A | B |\n| --- | --- |\n| 1 | 2 |"
    _AB_1_3 = "| A | B |\n| --- | --- |\n| 1 | 3 |"
    _A_1 = "| A |\n| --- |\n| 1 |"
    _Z_9 = "| Z |\n| --- |\n| 9 |"

    @staticmethod
    def _table(markdown, page_num):
        """Build a fresh table record so no dict is shared between tests."""
        return {"markdown": markdown, "page_num": page_num}

    def test_perfect_match(self):
        """Identical tables on the same page score 1.0 on every metric."""
        reference = self._table(self._AB_1_2, 1)
        candidate = self._table(self._AB_1_2, 1)
        scores = compute_table_structure_score([candidate], [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        for metric in ("mean_table_score", "mean_cell_content_f1", "table_match_rate"):
            self.assertEqual(scores[metric], 1.0)

    def test_partial_overlap_scores_between_zero_and_one(self):
        """One differing cell still pairs the tables but lowers the score."""
        reference = self._table(self._AB_1_2, 1)
        candidate = self._table(self._AB_1_3, 1)
        scores = compute_table_structure_score([candidate], [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        table_score = scores["mean_table_score"]
        self.assertGreater(table_score, 0.0)
        self.assertLess(table_score, 1.0)

    def test_extra_prediction_lowers_match_rate(self):
        """A surplus predicted table halves the match rate and shows in the delta."""
        reference = self._table(self._A_1, 1)
        candidates = [
            self._table(self._A_1, 1),
            self._table(self._Z_9, 2),
        ]
        scores = compute_table_structure_score(candidates, [reference])
        self.assertEqual(scores["matched_pair_count"], 1)
        self.assertEqual(scores["table_match_rate"], 0.5)
        self.assertEqual(scores["table_count_delta"], 1)

    def test_no_matching_page_yields_no_pair(self):
        """Identical content on different pages is not paired."""
        reference = self._table(self._A_1, 1)
        candidate = self._table(self._A_1, 2)
        scores = compute_table_structure_score([candidate], [reference])
        self.assertEqual(scores["matched_pair_count"], 0)

    def test_empty_inputs_are_vacuous(self):
        """With nothing to compare, the metrics report a perfect 1.0."""
        scores = compute_table_structure_score([], [])
        self.assertEqual(scores["mean_table_score"], 1.0)
        self.assertEqual(scores["table_match_rate"], 1.0)
class TestComputeFormulaExtraction(unittest.TestCase):
    """Character-error-rate scoring of predicted formulas against truths."""

    @staticmethod
    def _formula(latex, page_num=1):
        """Build a fresh formula record (all fixtures here live on page 1)."""
        return {"latex": latex, "page_num": page_num}

    def test_exact_match_yields_zero_cer(self):
        """Identical LaTeX gives zero error and full accuracy."""
        predicted = [self._formula("E = mc^2")]
        expected = [self._formula("E = mc^2")]
        scores = compute_formula_extraction(predicted, expected)
        self.assertEqual(scores["mean_cer"], 0.0)
        self.assertEqual(scores["mean_accuracy"], 1.0)
        self.assertEqual(scores["exact_match_rate"], 1.0)

    def test_one_char_off_yields_proportional_cer(self):
        """A single substituted character costs 1/len(reference)."""
        predicted = [self._formula("E = mc^3")]
        expected = [self._formula("E = mc^2")]
        scores = compute_formula_extraction(predicted, expected)
        # One edit (Levenshtein distance 1) against an 8-character reference.
        self.assertAlmostEqual(scores["mean_cer"], 1 / 8, places=6)
        self.assertEqual(scores["exact_match_rate"], 0.0)

    def test_empty_inputs_are_vacuous(self):
        """No predictions and no truths count as a perfect result."""
        scores = compute_formula_extraction([], [])
        self.assertEqual(scores["mean_cer"], 0.0)
        self.assertEqual(scores["mean_accuracy"], 1.0)

    def test_one_side_empty_yields_full_error(self):
        """A truth with no prediction at all is scored as total error."""
        expected = [self._formula("x")]
        scores = compute_formula_extraction([], expected)
        self.assertEqual(scores["mean_cer"], 1.0)
        self.assertEqual(scores["mean_accuracy"], 0.0)

    def test_dollar_delimiters_stripped(self):
        """``$$...$$`` wrappers on a prediction do not break an exact match."""
        predicted = [self._formula("$$E = mc^2$$")]
        expected = [self._formula("E = mc^2")]
        scores = compute_formula_extraction(predicted, expected)
        self.assertEqual(scores["exact_match_rate"], 1.0)

    def test_greedy_matching_picks_lowest_cer_pair(self):
        """Pairs are formed by similarity, not by list position."""
        predicted = [
            self._formula("E = mc^2"),
            self._formula("F = ma"),
        ]
        # Same formulas as the predictions, listed in the opposite order.
        expected = [
            self._formula("F = ma"),
            self._formula("E = mc^2"),
        ]
        scores = compute_formula_extraction(predicted, expected)
        self.assertEqual(scores["matched_pair_count"], 2)
        self.assertEqual(scores["exact_match_rate"], 1.0)
class TestOmniDocBenchAdapters(unittest.TestCase):
    """Conversion of OmniDocBench-style ground truth into truth records."""

    @staticmethod
    def _ground_truth(*entries):
        """Wrap layout entries in the ``layout_dets`` envelope used below."""
        return {"layout_dets": list(entries)}

    def test_table_truths_extract_markdown_and_page(self):
        """Table entries are kept (either category casing); the title is dropped."""
        markdown_table = {"category": "table", "markdown": "| A |\n| --- |\n| 1 |", "page_num": 1}
        title = {"category": "Title", "text": "ignore", "page_num": 1}
        html_table = {"category": "Table", "html": "<table><tr><td>x</td></tr></table>", "page_num": 2}
        payload = self._ground_truth(markdown_table, title, html_table)

        truths = omnidocbench_table_truths(payload)

        self.assertEqual(len(truths), 2)
        # Truths are expected in source order: page 1 first, then page 2.
        for index, expected_page in enumerate((1, 2)):
            self.assertEqual(truths[index]["page_num"], expected_page)

    def test_formula_truths_extract_latex(self):
        """Formula and Equation entries both yield a ``latex`` value."""
        payload = self._ground_truth(
            {"category": "formula", "latex": "E = mc^2", "page_num": 1},
            {"category": "Equation", "text": "F = ma", "page_num": 2},
            {"category": "Title", "text": "ignore", "page_num": 1},
        )

        truths = omnidocbench_formula_truths(payload)

        self.assertEqual(len(truths), 2)
        # The second entry carries its formula under "text" rather than "latex".
        for index, expected_latex in enumerate(("E = mc^2", "F = ma")):
            self.assertEqual(truths[index]["latex"], expected_latex)

    def test_unknown_shape_returns_empty(self):
        """Ground truth without ``layout_dets`` produces no truths."""
        unrecognised = {"weird": True}
        self.assertEqual(omnidocbench_table_truths(unrecognised), [])
        self.assertEqual(omnidocbench_formula_truths({}), [])
class TestParsedRecords(unittest.TestCase):
    """Extraction of table and formula records from a ``ParsedDocument``."""

    @staticmethod
    def _document(**parts):
        """Build the ``d1`` PDF document fixture with the given collections."""
        return ParsedDocument(
            doc_id="d1",
            source_path="/tmp/d1.pdf",
            file_type="pdf",
            quality_report=QualityReport(),
            **parts,
        )

    def test_parsed_table_records_dedupes_object_and_element(self):
        """A table present as both Element and TableObject yields a ``t1`` record."""
        table_markdown = "| A |\n| --- |\n| 1 |"
        table_element = Element(
            element_id="t1",
            doc_id="d1",
            page_num=1,
            type="table",
            markdown=table_markdown,
        )
        table_object = TableObject(
            table_id="t1",
            page_nums=[1],
            markdown=table_markdown,
        )
        document = self._document(elements=[table_element], tables=[table_object])

        records = parsed_table_records(document)

        # The same table is supplied twice: once as an Element and once as a
        # TableObject. The original author's note says the two sources are
        # keyed separately and may both be kept; the assertions only pin a
        # lower bound, so either one or two records is accepted.
        self.assertGreaterEqual(len(records), 1)
        self.assertTrue(any(entry["table_id"] == "t1" for entry in records))

    def test_parsed_formula_records_extract_latex(self):
        """Only the non-empty formula element is expected to become a record."""
        document = self._document(
            elements=[
                Element(element_id="f1", doc_id="d1", page_num=1, type="formula", text="E = mc^2"),
                Element(element_id="p1", doc_id="d1", page_num=1, type="paragraph", text="not a formula"),
                # Formula element with empty text; the count below expects it to be skipped.
                Element(element_id="f2", doc_id="d1", page_num=2, type="formula", text=""),
            ],
        )

        records = parsed_formula_records(document)

        self.assertEqual(len(records), 1)
        only_record = records[0]
        self.assertEqual(only_record["formula_id"], "f1")
        self.assertEqual(only_record["latex"], "E = mc^2")
class TestBenchmarkIntegration(unittest.TestCase):
    """End-to-end smoke test of the parser benchmark on an OmniDocBench-tagged doc."""

    def test_omnidocbench_smoke_run_emits_metrics(self):
        """Run the benchmark through a fake loader and check metrics and CSVs appear.

        A markdown source is served by a one-shot loader that tags the
        document as ``omnidocbench``. This exercises the full benchmark
        wiring (table + formula adapters, CSVs) without needing PyMuPDF to
        parse bytes.
        """
        ground_truth = {
            "layout_dets": [
                {
                    "category": "table",
                    "markdown": "| A | B |\n| --- | --- |\n| 1 | 2 |",
                    "page_num": 1,
                },
                {"category": "formula", "latex": "E = mc^2", "page_num": 1},
            ]
        }
        with tempfile.TemporaryDirectory() as tmp_name:
            tmp = Path(tmp_name)
            src = tmp / "in"
            src.mkdir()
            md_path = src / "doc.md"
            md_path.write_text("# Doc\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n", encoding="utf-8")

            def fake_loader(root: Path):
                # Ignores ``root`` and always yields the single fixture document.
                yield DatasetDocument(
                    dataset_id="omnidocbench",
                    doc_id="doc",
                    path=md_path,
                    ground_truth=ground_truth,
                    metadata={},
                )

            # Snapshot whatever loader is registered now so the registry can be
            # put back exactly as found, without importing private loader
            # functions inside the cleanup path.
            previous_loader = _DATASET_LOADERS.get("omnidocbench")
            register_dataset_loader("omnidocbench", fake_loader)
            try:
                summary = run_parser_benchmark(src, tmp / "out", dataset_name="omnidocbench")
            finally:
                # Cleanup must not raise on its own: an error here would hide
                # the real failure from the benchmark run above.
                if previous_loader is None:
                    _DATASET_LOADERS.pop("omnidocbench", None)
                else:
                    _DATASET_LOADERS["omnidocbench"] = previous_loader

            self.assertEqual(summary["dataset_name"], "omnidocbench")
            doc = summary["documents"][0]
            self.assertTrue(doc["table_structure_evaluated"])
            self.assertTrue(doc["formula_evaluated"])
            # These checks must run before the temporary directory is removed.
            self.assertTrue((tmp / "out" / "table_structure_runs.csv").exists())
            self.assertTrue((tmp / "out" / "formula_runs.csv").exists())
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()