oneocr / _archive /test_compare.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Compare ONNX engine vs original DLL engine on all images in working_space/input/.
Outputs a detailed comparison report and saves results side-by-side.
"""
import sys
import time
from difflib import SequenceMatcher
from pathlib import Path
from PIL import Image
sys.path.insert(0, str(Path(__file__).parent.parent))
from ocr.engine import OcrEngine
from ocr.engine_onnx import OcrEngineOnnx
from ocr.models import OcrResult
def fmt_result(r: OcrResult) -> dict:
    """Flatten an OcrResult into a plain dict that is easy to compare and print.

    The returned dict has the keys ``text``, ``n_lines``, ``n_words``,
    ``avg_conf`` (rounded to 3 decimals), ``angle``, ``lines`` and ``error``.
    Each entry of ``lines`` holds the line ``text`` plus a ``words`` list of
    ``{"text", "conf", "bbox"}`` dicts.
    """

    def _summarise_word(word) -> dict:
        # Word-level view: text, rounded confidence and a compact bbox string.
        word_text = word.text
        word_conf = round(word.confidence, 3)
        rect = word.bounding_rect
        if rect:
            # Formats the (x1, y1) and (x3, y3) points of the rectangle;
            # presumably opposite corners -- confirm against ocr.models.
            box = (
                f"({rect.x1:.0f},{rect.y1:.0f})"
                f"->({rect.x3:.0f},{rect.y3:.0f})"
            )
        else:
            box = "none"
        return {"text": word_text, "conf": word_conf, "bbox": box}

    line_summaries = [
        {
            "text": ocr_line.text,
            "words": [_summarise_word(word) for word in ocr_line.words],
        }
        for ocr_line in r.lines
    ]

    return {
        "text": r.text,
        "n_lines": len(r.lines),
        "n_words": sum(len(ocr_line.words) for ocr_line in r.lines),
        "avg_conf": round(r.average_confidence, 3),
        "angle": r.text_angle,
        "lines": line_summaries,
        "error": r.error,
    }
def compare_texts(dll_text: str, onnx_text: str) -> tuple[bool, str, float]:
    """Compare two OCR texts.

    Args:
        dll_text: Text produced by the reference (DLL) engine.
        onnx_text: Text produced by the ONNX engine.

    Returns:
        A ``(match, diff_description, similarity)`` tuple. ``match`` is True
        when the texts are identical or differ only in whitespace; in both
        cases ``similarity`` is 1.0. Otherwise ``match`` is False and
        ``similarity`` is the character-level ``SequenceMatcher`` ratio of the
        whitespace-normalised texts (0.0 - 1.0). A difference that is purely
        upper/lower case is still reported as a non-match, but labelled
        ``CASE DIFF ONLY``.
    """
    if dll_text == onnx_text:
        return True, "IDENTICAL", 1.0

    # Normalize whitespace for soft match: collapse every whitespace run
    # (including newlines) into a single space and trim the ends.
    dll_norm = " ".join(dll_text.split())
    onnx_norm = " ".join(onnx_text.split())
    if dll_norm == onnx_norm:
        return True, "MATCH (whitespace diff only)", 1.0

    # Character-level similarity. autojunk must be disabled: with the default
    # heuristic, any character that makes up more than ~1% of a 200+ character
    # text is ignored when seeding matches, which for natural-language OCR
    # output means nearly every letter and can drive the ratio towards 0 for
    # almost identical texts. The cost is a slower comparison on long inputs.
    ratio = SequenceMatcher(None, dll_norm, onnx_norm, autojunk=False).ratio()

    # Case-insensitive equality is called out separately in the description.
    if dll_norm.lower() == onnx_norm.lower():
        return False, f"CASE DIFF ONLY ({ratio:.1%})", ratio
    return False, f"MISMATCH ({ratio:.1%} similar)", ratio
def main():
    """Run both OCR engines over every PNG in working_space/input/ and report.

    For each image the DLL engine and the ONNX engine are run via
    ``recognize_pil`` and timed in milliseconds; their texts are compared with
    ``compare_texts``. A per-image summary is printed to stdout, and a
    Markdown report with side-by-side tables, a summary section and a
    line-count breakdown of the mismatches is written to
    ``working_space/output/comparison_report.md``.

    Both directories are resolved relative to the current working directory.
    Returns early (after printing a notice) when no PNG files are found.
    """
    # Local import: only this function needs it, for the report header date.
    from datetime import date

    input_dir = Path("working_space/input")
    output_dir = Path("working_space/output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect all image files
    image_files = sorted(input_dir.glob("*.png"))
    if not image_files:
        print("No images found in working_space/input/")
        return

    total_count = len(image_files)

    print("=" * 80)
    print(" ONEOCR: ONNX vs DLL ENGINE COMPARISON")
    print(f" Images: {total_count} from {input_dir}")
    print("=" * 80)

    # Initialize engines
    print("\n Initializing engines...")
    dll_engine = OcrEngine()
    print(" ✓ DLL engine ready")
    onnx_engine = OcrEngineOnnx()
    print(" ✓ ONNX engine ready")

    # Results accumulator
    results = []
    match_count = 0
    report_lines = [
        "# ONEOCR: ONNX vs DLL Comparison Report",
        "",
        # Stamp the report with the day it was actually generated.
        f"**Date:** {date.today().isoformat()}",
        f"**Images:** {total_count}",
        "",
        "---",
        "",
    ]

    for index, img_path in enumerate(image_files, start=1):
        name = img_path.name

        # Keep the image file open only while both engines consume it, so a
        # large input directory does not accumulate open file handles.
        with Image.open(img_path) as img:
            w_img, h_img = img.size

            print(f"\n{'─' * 70}")
            print(f" [{index}/{total_count}] {name} ({w_img}×{h_img})")
            print(f"{'─' * 70}")

            # DLL
            t0 = time.perf_counter()
            dll_result = dll_engine.recognize_pil(img)
            t_dll = (time.perf_counter() - t0) * 1000

            # ONNX
            t0 = time.perf_counter()
            onnx_result = onnx_engine.recognize_pil(img)
            t_onnx = (time.perf_counter() - t0) * 1000

        dll_info = fmt_result(dll_result)
        onnx_info = fmt_result(onnx_result)

        match, diff_desc, similarity = compare_texts(dll_result.text, onnx_result.text)
        if match:
            match_count += 1
            status = "✅ MATCH"
        else:
            status = f"❌ {diff_desc}"

        # Console preview: newlines made visible, capped at 80 characters.
        dll_text_short = dll_result.text.replace('\n', ' ↵ ')[:80]
        onnx_text_short = onnx_result.text.replace('\n', ' ↵ ')[:80]

        print(f" DLL: \"{dll_text_short}\"")
        print(f" Lines={dll_info['n_lines']}, Words={dll_info['n_words']}, "
              f"Conf={dll_info['avg_conf']:.1%}, Time={t_dll:.0f}ms")
        print(f" ONNX: \"{onnx_text_short}\"")
        print(f" Lines={onnx_info['n_lines']}, Words={onnx_info['n_words']}, "
              f"Conf={onnx_info['avg_conf']:.1%}, Time={t_onnx:.0f}ms")
        print(f" Status: {status}")

        # Per-word comparison
        dll_words = [w.text for l in dll_result.lines for w in l.words]
        onnx_words = [w.text for l in onnx_result.lines for w in l.words]

        if not match:
            print(f" DLL words: {dll_words[:15]}{'...' if len(dll_words) > 15 else ''}")
            print(f" ONNX words: {onnx_words[:15]}{'...' if len(onnx_words) > 15 else ''}")

        # Report: pipes are escaped so OCR text cannot break the table columns.
        dll_text_esc = dll_result.text.replace('|', '\\|').replace('\n', ' ↵ ')
        onnx_text_esc = onnx_result.text.replace('|', '\\|').replace('\n', ' ↵ ')

        report_lines.extend([
            f"## {index}. {name} ({w_img}×{h_img})",
            f"**Status:** {status}",
            "",
            "| | DLL (Original) | ONNX (Our) |",
            "|---|---|---|",
            f"| Text | `{dll_text_esc}` | `{onnx_text_esc}` |",
            f"| Lines | {dll_info['n_lines']} | {onnx_info['n_lines']} |",
            f"| Words | {dll_info['n_words']} | {onnx_info['n_words']} |",
            f"| Avg Conf | {dll_info['avg_conf']:.1%} | {onnx_info['avg_conf']:.1%} |",
            f"| Angle | {dll_info['angle']} | {onnx_info['angle']} |",
            f"| Time | {t_dll:.0f}ms | {t_onnx:.0f}ms |",
            "",
        ])

        # Word diff if mismatch
        if not match:
            report_lines.append("**Word-level diff:**")
            report_lines.append(f"- DLL: `{' | '.join(dll_words)}`")
            report_lines.append(f"- ONNX: `{' | '.join(onnx_words)}`")
            report_lines.append("")

            # Per-line comparison; the shorter side is padded with "(missing)".
            max_lines = max(dll_info['n_lines'], onnx_info['n_lines'])
            if max_lines > 0:
                report_lines.append("**Per-line:**")
                report_lines.append("| Line | DLL | ONNX | Match |")
                report_lines.append("|---|---|---|---|")
                for li in range(max_lines):
                    dll_lt = dll_info['lines'][li]['text'] if li < len(dll_info['lines']) else "(missing)"
                    onnx_lt = onnx_info['lines'][li]['text'] if li < len(onnx_info['lines']) else "(missing)"
                    line_match = "✅" if dll_lt == onnx_lt else "❌"
                    dll_lt_esc = dll_lt.replace('|', '\\|')
                    onnx_lt_esc = onnx_lt.replace('|', '\\|')
                    report_lines.append(f"| L{li} | `{dll_lt_esc}` | `{onnx_lt_esc}` | {line_match} |")
                report_lines.append("")

        report_lines.append("---")
        report_lines.append("")

        results.append({
            "name": name,
            "match": match,
            "diff": diff_desc,
            "similarity": similarity,
            "dll_text": dll_result.text,
            "onnx_text": onnx_result.text,
            "dll_words": dll_words,
            "onnx_words": onnx_words,
            "dll_n_lines": dll_info['n_lines'],
            "onnx_n_lines": onnx_info['n_lines'],
        })

    # Summary (total_count > 0 is guaranteed by the early return above).
    print(f"\n{'=' * 80}")
    print(f" SUMMARY: {match_count}/{total_count} images match "
          f"({match_count/total_count:.0%})")
    print(f"{'=' * 80}")

    mismatches = [r for r in results if not r['match']]
    # Computed once and shared by the console summary and the report.
    avg_sim = (
        sum(r['similarity'] for r in mismatches) / len(mismatches)
        if mismatches else None
    )

    if mismatches:
        print(f"\n MISMATCHES ({len(mismatches)}), avg similarity: {avg_sim:.1%}:")
        for r in mismatches:
            dll_short = r['dll_text'].replace('\n', ' ↵ ')[:50]
            onnx_short = r['onnx_text'].replace('\n', ' ↵ ')[:50]
            print(f" ❌ {r['name']}: {r['diff']}")
            print(f" DLL: \"{dll_short}\"")
            print(f" ONNX: \"{onnx_short}\"")

    # Append summary to report
    report_lines.extend([
        "## Summary",
        "",
        f"- **Total images:** {total_count}",
        f"- **Matches:** {match_count}",
        f"- **Mismatches:** {total_count - match_count}",
        f"- **Match rate:** {match_count/total_count:.0%}",
        "",
    ])

    if mismatches:
        report_lines.append(f"### Avg mismatch similarity: {avg_sim:.1%}")
        report_lines.append("")
        report_lines.append("### Mismatched images")
        report_lines.append("| # | Image | DLL Text | ONNX Text | Similarity |")
        report_lines.append("|---|---|---|---|---|")
        for i, r in enumerate(mismatches):
            dll_s = r['dll_text'].replace('\n', ' ↵ ').replace('|', '\\|')[:40]
            onnx_s = r['onnx_text'].replace('\n', ' ↵ ').replace('|', '\\|')[:40]
            report_lines.append(
                f"| {i+1} | {r['name']} | `{dll_s}` | `{onnx_s}` | {r['similarity']:.1%} |"
            )
        report_lines.append("")

        # Common issues analysis
        report_lines.append("### Common Issue Patterns")
        report_lines.append("")

        # Categorize mismatches by how the detected line counts relate.
        extra_lines_onnx = sum(1 for r in mismatches if r['onnx_n_lines'] > r['dll_n_lines'])
        fewer_lines_onnx = sum(1 for r in mismatches if r['onnx_n_lines'] < r['dll_n_lines'])
        same_lines = sum(1 for r in mismatches if r['onnx_n_lines'] == r['dll_n_lines'])

        report_lines.append(f"- ONNX detects MORE lines than DLL: {extra_lines_onnx} cases")
        report_lines.append(f"- ONNX detects FEWER lines than DLL: {fewer_lines_onnx} cases")
        report_lines.append(f"- Same line count but different text: {same_lines} cases")

    # Insert match rate at top: index 4 is directly after the "**Images:**"
    # header line, and the rate is only known once every image is processed.
    report_lines.insert(4, f"**Match Rate:** {match_count}/{total_count} ({match_count/total_count:.0%})")

    report_path = output_dir / "comparison_report.md"
    report_path.write_text("\n".join(report_lines), encoding="utf-8")
    print(f"\n Report saved: {report_path}")
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()