Spaces:

jw-tools
/

jw-search

Running

App Files Files Community

jw-search / scripts /run-ocr-benchmark.py

jw-tools

deploy: latest main (lazy-ML cold start, durable launcher, web-image search, scene search) + full-app data refresh

7ea1851 verified about 13 hours ago

raw

history blame contribute delete

1.96 kB

	#!/usr/bin/env python3
	"""
	Score OCR predictions against a ground-truth annotation file.

	This gives us a stable benchmark before wiring OCR into production indexing.
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	from pathlib import Path


	# This script sits one directory below ROOT_DIR, so ROOT_DIR is the
	# grandparent of this file's resolved path.
	ROOT_DIR = Path(__file__).resolve().parents[1]
	BACKEND_DIR = ROOT_DIR.joinpath("backend")

	# Put the backend directory at the front of the import path (only once) so
	# the backend module imported below can be resolved.
	_backend_entry = str(BACKEND_DIR)
	if _backend_entry not in sys.path:
	    sys.path.insert(0, _backend_entry)

	from ocr_ground_truth import build_benchmark_report, load_ground_truth, load_predictions # noqa: E402


	def build_parser() -> argparse.ArgumentParser:
	    """
	    Build the command-line parser for the OCR benchmark script.

	    ``--ground-truth`` and ``--predictions`` are required and parsed as
	    ``Path`` values. ``--report`` is optional and defaults to
	    ``docs/reports/ocr-benchmark-report.json`` under ``ROOT_DIR``.
	    """
	    default_report = ROOT_DIR / "docs" / "reports" / "ocr-benchmark-report.json"
	    cli = argparse.ArgumentParser(description="Score OCR predictions against Search-UI ground truth.")

	    # Both input files are mandatory and share the same option settings.
	    required_inputs = (
	        ("--ground-truth", "Ground-truth JSON file"),
	        ("--predictions", "Predictions JSON file"),
	    )
	    for flag, help_text in required_inputs:
	        cli.add_argument(flag, required=True, type=Path, help=help_text)

	    cli.add_argument("--report", type=Path, default=default_report, help="Where to write the report JSON")
	    return cli


	def main(argv: list[str] | None = None) -> int:
	    """
	    Score predictions against ground truth, write the report, print a summary.

	    Loads the annotation and prediction files named on the command line,
	    builds the benchmark report from them, writes it as UTF-8 JSON to the
	    ``--report`` path (creating parent directories as needed), and prints a
	    one-line summary followed by the report location.

	    Args:
	        argv: Arguments to parse instead of the process command line. ``None``
	            (the default) lets argparse read ``sys.argv[1:]``, so running the
	            script directly behaves as before; passing a list allows calling
	            ``main`` programmatically.

	    Returns:
	        ``0`` once the report has been written and the summary printed.

	    Raises:
	        KeyError: If the report has no ``"summary"`` entry or the summary
	            lacks one of the printed metrics (assumes the report is a
	            mapping -- confirm against ``build_benchmark_report``).
	    """
	    args = build_parser().parse_args(argv)
	    annotations = load_ground_truth(args.ground_truth)
	    predictions = load_predictions(args.predictions)
	    report = build_benchmark_report(annotations, predictions)

	    # Create the report directory on demand so the default location works
	    # even when it does not exist yet.
	    args.report.parent.mkdir(parents=True, exist_ok=True)
	    # ensure_ascii=False keeps non-ASCII text readable in the report file.
	    args.report.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")

	    summary = report["summary"]
	    print(
	        "OCR benchmark:"
	        f" segments={summary['segments']}"
	        f" exact_match_rate={summary['exact_match_rate']}"
	        f" avg_similarity={summary['average_normalized_edit_similarity']}"
	        f" junk_rate={summary['junk_prediction_rate']}"
	        f" missing_rate={summary['missing_prediction_rate']}"
	    )
	    print(f"Report: {args.report}")
	    return 0


	# Run the benchmark only when executed as a script; main()'s return value
	# becomes the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())