oneocr / _archive /test_compare.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 about 24 hours ago

10.5 kB

	"""Compare ONNX engine vs original DLL engine on all images in working_space/input/.

	Outputs a detailed comparison report and saves results side-by-side.
	"""
	import sys
	import time
	from difflib import SequenceMatcher
	from pathlib import Path

	from PIL import Image

	sys.path.insert(0, str(Path(__file__).parent.parent))

	from ocr.engine import OcrEngine
	from ocr.engine_onnx import OcrEngineOnnx
	from ocr.models import OcrResult


	def fmt_result(r: OcrResult) -> dict:
	    """Flatten an OCR result into plain dicts and scalars for comparison.

	    Args:
	        r: Recognition result. The attributes read here are ``text``,
	            ``lines`` (each with ``text`` and ``words``),
	            ``average_confidence``, ``text_angle`` and ``error``.

	    Returns:
	        A dict with the keys ``text``, ``n_lines``, ``n_words``,
	        ``avg_conf`` (rounded to 3 decimals), ``angle``, ``lines`` and
	        ``error``. Every entry of ``lines`` is ``{"text", "words"}`` and
	        every word is ``{"text", "conf", "bbox"}``.
	    """

	    def _bbox_label(rect) -> str:
	        # Only two corners (x1, y1) and (x3, y3) are shown, rounded to whole
	        # units; presumably opposite corners of the word quad - TODO confirm.
	        if not rect:
	            return "none"
	        return (
	            f"({rect.x1:.0f},{rect.y1:.0f})"
	            f"->({rect.x3:.0f},{rect.y3:.0f})"
	        )

	    lines_info = []
	    word_total = 0
	    for line in r.lines:
	        words_info = [
	            {
	                "text": word.text,
	                "conf": round(word.confidence, 3),
	                "bbox": _bbox_label(word.bounding_rect),
	            }
	            for word in line.words
	        ]
	        # Count via len(line.words), not the list just built, so the
	        # number reflects the source object exactly.
	        word_total += len(line.words)
	        lines_info.append({
	            "text": line.text,
	            "words": words_info,
	        })

	    return {
	        "text": r.text,
	        "n_lines": len(r.lines),
	        "n_words": word_total,
	        "avg_conf": round(r.average_confidence, 3),
	        "angle": r.text_angle,
	        "lines": lines_info,
	        "error": r.error,
	    }


	def compare_texts(dll_text: str, onnx_text: str) -> tuple[bool, str, float]:
	    """Compare the text produced by the two OCR engines.

	    Args:
	        dll_text: Text recognised by the DLL engine.
	        onnx_text: Text recognised by the ONNX engine.

	    Returns:
	        ``(match, description, similarity)``:

	        * byte-identical input -> ``(True, "IDENTICAL", 1.0)``
	        * equal after collapsing runs of whitespace ->
	          ``(True, "MATCH (whitespace diff only)", 1.0)``
	        * equal ignoring case -> ``(False, "CASE DIFF ONLY (..%)", ratio)``;
	          a pure case difference is still reported as a non-match
	        * anything else -> ``(False, "MISMATCH (..% similar)", ratio)``

	        ``ratio`` is ``difflib.SequenceMatcher.ratio()`` computed on the
	        whitespace-normalised strings.
	    """
	    if dll_text == onnx_text:
	        return True, "IDENTICAL", 1.0

	    # Soft match: collapse every run of whitespace (newlines included)
	    # to a single space and strip the ends.
	    dll_norm = " ".join(dll_text.split())
	    onnx_norm = " ".join(onnx_text.split())
	    if dll_norm == onnx_norm:
	        return True, "MATCH (whitespace diff only)", 1.0

	    # Character-level similarity, reported for every non-matching pair.
	    ratio = SequenceMatcher(None, dll_norm, onnx_norm).ratio()

	    if dll_norm.lower() == onnx_norm.lower():
	        return False, f"CASE DIFF ONLY ({ratio:.1%})", ratio

	    return False, f"MISMATCH ({ratio:.1%} similar)", ratio


	def _md_cell(text: str) -> str:
	    """Escape *text* so it fits inside a single Markdown table cell.

	    Pipes are backslash-escaped (a bare ``|`` would start a new column) and
	    newlines are shown as `` ↵ `` so the cell stays on one row.
	    """
	    return text.replace("|", "\\|").replace("\n", " ↵ ")


	def _preview(text: str, limit: int) -> str:
	    """Return *text* on one line (newlines shown as `` ↵ ``), cut to *limit* chars."""
	    return text.replace("\n", " ↵ ")[:limit]


	def _word_preview(words: list, limit: int = 15) -> str:
	    """Render the first *limit* words as a list repr, adding ``...`` if cut."""
	    return f"{words[:limit]}{'...' if len(words) > limit else ''}"


	def main():
	    """Run both OCR engines on every PNG in ``working_space/input`` and report.

	    For each image the DLL and ONNX engines are timed, their texts are
	    compared with ``compare_texts`` and a console summary is printed. A
	    Markdown report is written to
	    ``working_space/output/comparison_report.md``. Paths are relative to the
	    current working directory. Returns ``None``; when no PNG files are found
	    a message is printed and no report is written.
	    """
	    # Local import keeps this block self-contained (only needed for the
	    # report date).
	    from datetime import date

	    input_dir = Path("working_space/input")
	    output_dir = Path("working_space/output")
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Collect all image files
	    image_files = sorted(input_dir.glob("*.png"))
	    if not image_files:
	        print("No images found in working_space/input/")
	        return

	    total_count = len(image_files)

	    print("=" * 80)
	    print(" ONEOCR: ONNX vs DLL ENGINE COMPARISON")
	    print(f" Images: {total_count} from {input_dir}")
	    print("=" * 80)

	    # Initialize engines
	    print("\n Initializing engines...")
	    dll_engine = OcrEngine()
	    print(" ✓ DLL engine ready")
	    onnx_engine = OcrEngineOnnx()
	    print(" ✓ ONNX engine ready")

	    results = []
	    match_count = 0
	    # Per-image sections; the report header is prepended at the very end,
	    # once the match rate is known.
	    report_lines = []

	    for index, img_path in enumerate(image_files, start=1):
	        name = img_path.name

	        # The context manager releases the file handle even if an engine
	        # raises.
	        with Image.open(img_path) as img:
	            # Image.open is lazy: decode now so the first engine timed does
	            # not also pay for reading the file.
	            img.load()
	            w_img, h_img = img.size

	            print(f"\n{'─' * 70}")
	            print(f" [{index}/{total_count}] {name} ({w_img}×{h_img})")
	            print(f"{'─' * 70}")

	            # DLL
	            t0 = time.perf_counter()
	            dll_result = dll_engine.recognize_pil(img)
	            t_dll = (time.perf_counter() - t0) * 1000

	            # ONNX
	            t0 = time.perf_counter()
	            onnx_result = onnx_engine.recognize_pil(img)
	            t_onnx = (time.perf_counter() - t0) * 1000

	        dll_info = fmt_result(dll_result)
	        onnx_info = fmt_result(onnx_result)

	        match, diff_desc, similarity = compare_texts(dll_result.text, onnx_result.text)
	        if match:
	            match_count += 1
	            status = "✅ MATCH"
	        else:
	            status = f"❌ {diff_desc}"

	        print(f' DLL: "{_preview(dll_result.text, 80)}"')
	        print(f" Lines={dll_info['n_lines']}, Words={dll_info['n_words']}, "
	              f"Conf={dll_info['avg_conf']:.1%}, Time={t_dll:.0f}ms")
	        print(f' ONNX: "{_preview(onnx_result.text, 80)}"')
	        print(f" Lines={onnx_info['n_lines']}, Words={onnx_info['n_words']}, "
	              f"Conf={onnx_info['avg_conf']:.1%}, Time={t_onnx:.0f}ms")
	        print(f" Status: {status}")

	        # Per-word comparison
	        dll_words = [word.text for line in dll_result.lines for word in line.words]
	        onnx_words = [word.text for line in onnx_result.lines for word in line.words]

	        if not match:
	            print(f" DLL words: {_word_preview(dll_words)}")
	            print(f" ONNX words: {_word_preview(onnx_words)}")

	        # Report
	        report_lines.extend([
	            f"## {index}. {name} ({w_img}×{h_img})",
	            f"Status: {status}",
	            "",
	            "| | DLL (Original) | ONNX (Our) |",
	            "|---|---|---|",
	            f"| Text | `{_md_cell(dll_result.text)}` | `{_md_cell(onnx_result.text)}` |",
	            f"| Lines | {dll_info['n_lines']} | {onnx_info['n_lines']} |",
	            f"| Words | {dll_info['n_words']} | {onnx_info['n_words']} |",
	            f"| Avg Conf | {dll_info['avg_conf']:.1%} | {onnx_info['avg_conf']:.1%} |",
	            f"| Angle | {dll_info['angle']} | {onnx_info['angle']} |",
	            f"| Time | {t_dll:.0f}ms | {t_onnx:.0f}ms |",
	            "",
	        ])

	        # Word diff if mismatch
	        if not match:
	            # Joined outside the f-string: a backslash-free expression keeps
	            # this valid on Python versions before 3.12.
	            dll_joined = " | ".join(dll_words)
	            onnx_joined = " | ".join(onnx_words)
	            report_lines.append("Word-level diff:")
	            report_lines.append(f"- DLL: `{dll_joined}`")
	            report_lines.append(f"- ONNX: `{onnx_joined}`")
	            report_lines.append("")

	            # Per-line comparison
	            # NOTE(review): the source's indentation was lost; this table is
	            # assumed to belong to the mismatch branch - confirm.
	            dll_line_texts = [entry["text"] for entry in dll_info["lines"]]
	            onnx_line_texts = [entry["text"] for entry in onnx_info["lines"]]
	            max_lines = max(len(dll_line_texts), len(onnx_line_texts))
	            if max_lines > 0:
	                report_lines.append("Per-line:")
	                report_lines.append("| Line | DLL | ONNX | Match |")
	                report_lines.append("|---|---|---|---|")
	                for li in range(max_lines):
	                    # The engine that detected fewer lines gets a placeholder.
	                    dll_lt = dll_line_texts[li] if li < len(dll_line_texts) else "(missing)"
	                    onnx_lt = onnx_line_texts[li] if li < len(onnx_line_texts) else "(missing)"
	                    line_match = "✅" if dll_lt == onnx_lt else "❌"
	                    report_lines.append(
	                        f"| L{li} | `{_md_cell(dll_lt)}` | `{_md_cell(onnx_lt)}` | {line_match} |"
	                    )
	                report_lines.append("")

	        report_lines.append("---")
	        report_lines.append("")

	        results.append({
	            "name": name,
	            "match": match,
	            "diff": diff_desc,
	            "similarity": similarity,
	            "dll_text": dll_result.text,
	            "onnx_text": onnx_result.text,
	            "dll_words": dll_words,
	            "onnx_words": onnx_words,
	            "dll_n_lines": dll_info['n_lines'],
	            "onnx_n_lines": onnx_info['n_lines'],
	        })

	    # Summary
	    # total_count >= 1 here: the empty case returned early above.
	    match_rate = match_count / total_count
	    print(f"\n{'=' * 80}")
	    print(f" SUMMARY: {match_count}/{total_count} images match "
	          f"({match_rate:.0%})")
	    print(f"{'=' * 80}")

	    mismatches = [entry for entry in results if not entry['match']]
	    avg_sim = None
	    if mismatches:
	        avg_sim = sum(entry['similarity'] for entry in mismatches) / len(mismatches)
	        print(f"\n MISMATCHES ({len(mismatches)}), avg similarity: {avg_sim:.1%}:")
	        for entry in mismatches:
	            print(f" ❌ {entry['name']}: {entry['diff']}")
	            print(f' DLL: "{_preview(entry["dll_text"], 50)}"')
	            print(f' ONNX: "{_preview(entry["onnx_text"], 50)}"')

	    # Append summary to report
	    report_lines.extend([
	        "## Summary",
	        "",
	        f"- Total images: {total_count}",
	        f"- Matches: {match_count}",
	        f"- Mismatches: {total_count - match_count}",
	        f"- Match rate: {match_rate:.0%}",
	        "",
	    ])

	    if mismatches:
	        report_lines.append(f"### Avg mismatch similarity: {avg_sim:.1%}")
	        report_lines.append("")
	        report_lines.append("### Mismatched images")
	        report_lines.append("| # | Image | DLL Text | ONNX Text | Similarity |")
	        report_lines.append("|---|---|---|---|---|")
	        for number, entry in enumerate(mismatches, start=1):
	            # Truncate first, then escape, so the cut can never land in the
	            # middle of a backslash-pipe escape pair.
	            dll_s = _md_cell(_preview(entry['dll_text'], 40))
	            onnx_s = _md_cell(_preview(entry['onnx_text'], 40))
	            report_lines.append(
	                f"| {number} | {entry['name']} | `{dll_s}` | `{onnx_s}` | {entry['similarity']:.1%} |"
	            )
	        report_lines.append("")

	        # Common issues analysis
	        # NOTE(review): assumed to be part of the mismatch-only section
	        # (source indentation was lost) - confirm.
	        report_lines.append("### Common Issue Patterns")
	        report_lines.append("")

	        # Categorize mismatches by how the detected line counts differ
	        extra_lines_onnx = sum(1 for e in mismatches if e['onnx_n_lines'] > e['dll_n_lines'])
	        fewer_lines_onnx = sum(1 for e in mismatches if e['onnx_n_lines'] < e['dll_n_lines'])
	        same_lines = sum(1 for e in mismatches if e['onnx_n_lines'] == e['dll_n_lines'])

	        report_lines.append(f"- ONNX detects MORE lines than DLL: {extra_lines_onnx} cases")
	        report_lines.append(f"- ONNX detects FEWER lines than DLL: {fewer_lines_onnx} cases")
	        report_lines.append(f"- Same line count but different text: {same_lines} cases")

	    # Header is built last because it carries the final match rate.
	    header = [
	        "# ONEOCR: ONNX vs DLL Comparison Report",
	        "",
	        f"Date: {date.today().isoformat()}",
	        f"Images: {total_count}",
	        f"Match Rate: {match_count}/{total_count} ({match_rate:.0%})",
	        "",
	        "---",
	        "",
	    ]

	    report_path = output_dir / "comparison_report.md"
	    report_path.write_text("\n".join(header + report_lines), encoding="utf-8")
	    print(f"\n Report saved: {report_path}")


	if __name__ == "__main__":
	main()