oneocr

File size: 10,525 Bytes

ce847d4

"""Compare the ONNX engine against the original DLL engine on every PNG image in working_space/input/.

Prints a per-image comparison to the console and writes a detailed side-by-side
Markdown report to working_space/output/comparison_report.md.
"""
import sys
import time
from datetime import date
from difflib import SequenceMatcher
from pathlib import Path

from PIL import Image

# Put the directory one level above this script's folder at the front of the
# import path so the `ocr.*` imports below resolve when the file is run directly
# (presumably that directory is the project root — confirm against the repo layout).
sys.path.insert(0, str(Path(__file__).parent.parent))

from ocr.engine import OcrEngine
from ocr.engine_onnx import OcrEngineOnnx
from ocr.models import OcrResult


def fmt_result(r: OcrResult) -> dict:
    """Flatten an OcrResult into plain dicts/lists that are easy to compare and print.

    Returns a dict with the full text, line and word counts, the average
    confidence rounded to 3 decimals, the text angle, the error field, and a
    ``lines`` list holding each line's text and its per-word details.
    """

    def describe_word(word) -> dict:
        # A word without a bounding rectangle is reported as the string "none";
        # otherwise the box is rendered from corner 1 to corner 3.
        rect = word.bounding_rect
        if rect:
            box = (
                f"({rect.x1:.0f},{rect.y1:.0f})"
                f"->({rect.x3:.0f},{rect.y3:.0f})"
            )
        else:
            box = "none"
        return {
            "text": word.text,
            "conf": round(word.confidence, 3),
            "bbox": box,
        }

    per_line = [
        {
            "text": ln.text,
            "words": [describe_word(wd) for wd in ln.words],
        }
        for ln in r.lines
    ]
    word_total = sum(len(ln.words) for ln in r.lines)

    summary = {
        "text": r.text,
        "n_lines": len(r.lines),
        "n_words": word_total,
        "avg_conf": round(r.average_confidence, 3),
        "angle": r.text_angle,
        "lines": per_line,
        "error": r.error,
    }
    return summary


def compare_texts(dll_text: str, onnx_text: str) -> tuple[bool, str, float]:
    """Compare two OCR texts.

    Args:
        dll_text: Text produced by the DLL engine.
        onnx_text: Text produced by the ONNX engine.

    Returns:
        A ``(match, diff_description, similarity)`` tuple. ``match`` is True
        when the texts are identical or differ only in whitespace; in both of
        those cases ``similarity`` is 1.0. Otherwise ``similarity`` is the
        character-level ``SequenceMatcher`` ratio of the whitespace-normalized
        texts, and the description says whether only letter case differs.
    """
    if dll_text == onnx_text:
        return True, "IDENTICAL", 1.0

    # Soft match: collapse every run of whitespace (including newlines) to one space.
    dll_norm = " ".join(dll_text.split())
    onnx_norm = " ".join(onnx_text.split())
    if dll_norm == onnx_norm:
        return True, "MATCH (whitespace diff only)", 1.0

    # autojunk must be disabled: with the default heuristic, once the second
    # sequence reaches 200 characters every character that makes up more than
    # ~1% of it is treated as junk, which badly understates the similarity of
    # long, nearly identical texts.
    ratio = SequenceMatcher(None, dll_norm, onnx_norm, autojunk=False).ratio()

    # A case-only difference is still reported as a mismatch, but labelled as such.
    if dll_norm.lower() == onnx_norm.lower():
        return False, f"CASE DIFF ONLY ({ratio:.1%})", ratio

    return False, f"MISMATCH ({ratio:.1%} similar)", ratio


def _one_line(text: str) -> str:
    """Replace newlines with a visible ' ↵ ' marker so the text fits on one row."""
    return text.replace('\n', ' ↵ ')


def _md_cell(text: str) -> str:
    """Escape pipes and flatten newlines so the text is safe inside a Markdown table cell."""
    return _one_line(text.replace('|', '\\|'))


def main():
    """Run both OCR engines on every PNG in working_space/input/ and report the differences.

    For each image the DLL and ONNX engines are timed separately, their results
    are compared with ``compare_texts``, and a summary is printed. A Markdown
    report is written to working_space/output/comparison_report.md.

    Both directories are relative to the current working directory. The
    function returns early (writing no report) when no PNG files are found.
    """
    input_dir = Path("working_space/input")
    output_dir = Path("working_space/output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect all image files
    image_files = sorted(input_dir.glob("*.png"))
    if not image_files:
        print("No images found in working_space/input/")
        return
    total_count = len(image_files)

    print("=" * 80)
    print("  ONEOCR: ONNX vs DLL ENGINE COMPARISON")
    print(f"  Images: {total_count} from {input_dir}")
    print("=" * 80)

    # Initialize engines
    print("\n  Initializing engines...")
    dll_engine = OcrEngine()
    print("  ✓ DLL engine ready")
    onnx_engine = OcrEngineOnnx()
    print("  ✓ ONNX engine ready")

    # Results accumulator
    results = []
    match_count = 0

    # Per-image sections and the summary; the header is prepended at the end,
    # once the match rate is known.
    body_lines = []

    for index, img_path in enumerate(image_files, start=1):
        name = img_path.name

        # The context manager releases the file handle even if an engine raises.
        with Image.open(img_path) as img:
            # Image.open() is lazy. Decode now so the cost is not attributed
            # to whichever engine happens to run first.
            img.load()
            w_img, h_img = img.size

            print(f"\n{'─' * 70}")
            print(f"  [{index}/{total_count}] {name} ({w_img}×{h_img})")
            print(f"{'─' * 70}")

            # DLL
            t0 = time.perf_counter()
            dll_result = dll_engine.recognize_pil(img)
            t_dll = (time.perf_counter() - t0) * 1000

            # ONNX
            t0 = time.perf_counter()
            onnx_result = onnx_engine.recognize_pil(img)
            t_onnx = (time.perf_counter() - t0) * 1000

        dll_info = fmt_result(dll_result)
        onnx_info = fmt_result(onnx_result)

        match, diff_desc, similarity = compare_texts(dll_result.text, onnx_result.text)
        if match:
            match_count += 1
            status = "✅ MATCH"
        else:
            status = f"❌ {diff_desc}"

        dll_text_short = _one_line(dll_result.text)[:80]
        onnx_text_short = _one_line(onnx_result.text)[:80]

        print(f"  DLL:  \"{dll_text_short}\"")
        print(f"        Lines={dll_info['n_lines']}, Words={dll_info['n_words']}, "
              f"Conf={dll_info['avg_conf']:.1%}, Time={t_dll:.0f}ms")
        print(f"  ONNX: \"{onnx_text_short}\"")
        print(f"        Lines={onnx_info['n_lines']}, Words={onnx_info['n_words']}, "
              f"Conf={onnx_info['avg_conf']:.1%}, Time={t_onnx:.0f}ms")
        print(f"  Status: {status}")

        # Per-word comparison
        dll_words = [w.text for line in dll_result.lines for w in line.words]
        onnx_words = [w.text for line in onnx_result.lines for w in line.words]

        if not match:
            print(f"  DLL words:  {dll_words[:15]}{'...' if len(dll_words) > 15 else ''}")
            print(f"  ONNX words: {onnx_words[:15]}{'...' if len(onnx_words) > 15 else ''}")

        # Report
        body_lines.extend([
            f"## {index}. {name} ({w_img}×{h_img})",
            f"**Status:** {status}",
            "",
            "| | DLL (Original) | ONNX (Our) |",
            "|---|---|---|",
            f"| Text | `{_md_cell(dll_result.text)}` | `{_md_cell(onnx_result.text)}` |",
            f"| Lines | {dll_info['n_lines']} | {onnx_info['n_lines']} |",
            f"| Words | {dll_info['n_words']} | {onnx_info['n_words']} |",
            f"| Avg Conf | {dll_info['avg_conf']:.1%} | {onnx_info['avg_conf']:.1%} |",
            f"| Angle | {dll_info['angle']} | {onnx_info['angle']} |",
            f"| Time | {t_dll:.0f}ms | {t_onnx:.0f}ms |",
            "",
        ])

        # Word diff if mismatch
        if not match:
            body_lines.extend([
                "**Word-level diff:**",
                f"- DLL:  `{' | '.join(dll_words)}`",
                f"- ONNX: `{' | '.join(onnx_words)}`",
                "",
            ])

        # Per-line comparison; the engine with fewer lines is padded with "(missing)".
        dll_lines = dll_info['lines']
        onnx_lines = onnx_info['lines']
        max_lines = max(dll_info['n_lines'], onnx_info['n_lines'])
        if max_lines > 0:
            body_lines.append("**Per-line:**")
            body_lines.append("| Line | DLL | ONNX | Match |")
            body_lines.append("|---|---|---|---|")
            for li in range(max_lines):
                dll_lt = dll_lines[li]['text'] if li < len(dll_lines) else "(missing)"
                onnx_lt = onnx_lines[li]['text'] if li < len(onnx_lines) else "(missing)"
                line_match = "✅" if dll_lt == onnx_lt else "❌"
                dll_lt_esc = dll_lt.replace('|', '\\|')
                onnx_lt_esc = onnx_lt.replace('|', '\\|')
                body_lines.append(f"| L{li} | `{dll_lt_esc}` | `{onnx_lt_esc}` | {line_match} |")
            body_lines.append("")

        body_lines.append("---")
        body_lines.append("")

        results.append({
            "name": name,
            "match": match,
            "diff": diff_desc,
            "similarity": similarity,
            "dll_text": dll_result.text,
            "onnx_text": onnx_result.text,
            "dll_words": dll_words,
            "onnx_words": onnx_words,
            "dll_n_lines": dll_info['n_lines'],
            "onnx_n_lines": onnx_info['n_lines'],
        })

    # Summary (total_count >= 1 here, so the division is safe)
    match_rate = match_count / total_count
    print(f"\n{'=' * 80}")
    print(f"  SUMMARY: {match_count}/{total_count} images match "
          f"({match_rate:.0%})")
    print(f"{'=' * 80}")

    mismatches = [r for r in results if not r['match']]
    # Computed once and shared by the console summary and the report.
    avg_sim = (
        sum(r['similarity'] for r in mismatches) / len(mismatches)
        if mismatches else None
    )

    if mismatches:
        print(f"\n  MISMATCHES ({len(mismatches)}), avg similarity: {avg_sim:.1%}:")
        for r in mismatches:
            dll_short = _one_line(r['dll_text'])[:50]
            onnx_short = _one_line(r['onnx_text'])[:50]
            print(f"    ❌ {r['name']}: {r['diff']}")
            print(f"       DLL:  \"{dll_short}\"")
            print(f"       ONNX: \"{onnx_short}\"")

    # Append summary to report
    body_lines.extend([
        "## Summary",
        "",
        f"- **Total images:** {total_count}",
        f"- **Matches:** {match_count}",
        f"- **Mismatches:** {total_count - match_count}",
        f"- **Match rate:** {match_rate:.0%}",
        "",
    ])

    if mismatches:
        body_lines.append(f"### Avg mismatch similarity: {avg_sim:.1%}")
        body_lines.append("")
        body_lines.append("### Mismatched images")
        body_lines.append("| # | Image | DLL Text | ONNX Text | Similarity |")
        body_lines.append("|---|---|---|---|---|")
        for i, r in enumerate(mismatches, start=1):
            dll_s = _md_cell(r['dll_text'])[:40]
            onnx_s = _md_cell(r['onnx_text'])[:40]
            body_lines.append(
                f"| {i} | {r['name']} | `{dll_s}` | `{onnx_s}` | {r['similarity']:.1%} |"
            )
        body_lines.append("")

    # Common issues analysis
    body_lines.append("### Common Issue Patterns")
    body_lines.append("")

    # Categorize mismatches by how the detected line counts relate
    extra_lines_onnx = sum(1 for r in mismatches if r['onnx_n_lines'] > r['dll_n_lines'])
    fewer_lines_onnx = sum(1 for r in mismatches if r['onnx_n_lines'] < r['dll_n_lines'])
    same_lines = sum(1 for r in mismatches if r['onnx_n_lines'] == r['dll_n_lines'])

    body_lines.append(f"- ONNX detects MORE lines than DLL: {extra_lines_onnx} cases")
    body_lines.append(f"- ONNX detects FEWER lines than DLL: {fewer_lines_onnx} cases")
    body_lines.append(f"- Same line count but different text: {same_lines} cases")

    # The header is built last so it can carry the final match rate and the
    # actual run date.
    header_lines = [
        "# ONEOCR: ONNX vs DLL Comparison Report",
        "",
        f"**Date:** {date.today().isoformat()}",
        f"**Images:** {total_count}",
        f"**Match Rate:** {match_count}/{total_count} ({match_rate:.0%})",
        "",
        "---",
        "",
    ]

    report_path = output_dir / "comparison_report.md"
    report_path.write_text("\n".join(header_lines + body_lines), encoding="utf-8")
    print(f"\n  Report saved: {report_path}")


# Run the comparison only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()