OneOCR Dev
committed on
Commit
·
be4a6f1
1
Parent(s):
d2178ab
feat: Wine bridge - run DLL on Linux via Wine (100% accuracy)
Browse files
- tools/wine_bridge.py: C loader + Python bridge (subprocess wine64)
- tools/oneocr_loader.c: minimal C program loading oneocr.dll
- tools/oneocr_loader.exe: pre-compiled loader (tested 19/19 identical to DLL)
- ocr/engine_unified.py: auto-selects backend (DLL -> Wine -> ONNX)
- Dockerfile: Ubuntu 24.04 + Wine + MinGW ready-to-run
- test_wine_colab.ipynb: Google Colab notebook for Linux testing
- Updated main.py with --backend flag and unified engine
- Updated README.md with Linux setup docs
- .dockerignore +10 -0
- BRAINSTORM_ONEOCR_ACCURACY.md +540 -0
- BRAINSTORM_ONEOCR_ACCURACY_SUMMARY.md +50 -0
- Dockerfile +84 -0
- README.md +96 -9
- main.py +55 -38
- ocr/__init__.py +6 -1
- ocr/engine_unified.py +206 -0
- test_wine_colab.ipynb +223 -0
- tools/oneocr_loader.c +292 -0
- tools/oneocr_loader.exe +0 -0
- tools/wine_bridge.py +567 -0
.dockerignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.git/
|
| 5 |
+
_archive/
|
| 6 |
+
working_space/output/
|
| 7 |
+
*.egg-info/
|
| 8 |
+
.mypy_cache/
|
| 9 |
+
.ruff_cache/
|
| 10 |
+
BRAINSTORM_*.md
|
BRAINSTORM_ONEOCR_ACCURACY.md
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BRAINSTORM: OneOCR ONNX Pipeline — Closing the 53% → ~100% Accuracy Gap
|
| 2 |
+
|
| 3 |
+
**Date:** 2025-01-XX
|
| 4 |
+
**Scope:** How to improve OneOCR's pure-Python ONNX pipeline to match the DLL's accuracy
|
| 5 |
+
**Current match rate:** 53% (10/19 test images match DLL output exactly)
|
| 6 |
+
**Target:** ≥95% match rate with DLL
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## FAZA 0: Context & Problem Definition
|
| 11 |
+
|
| 12 |
+
### Co mamy
|
| 13 |
+
|
| 14 |
+
| Component | Status | Notes |
|
| 15 |
+
|---|---|---|
|
| 16 |
+
| 34 ONNX models extracted | ✅ | AES-256-CFB128 decrypted, custom ops removed |
|
| 17 |
+
| Detector (model_00) | ✅ Working | PixelLink FPN, 3 scales (stride 4/8/16) |
|
| 18 |
+
| ScriptID (model_01) | ✅ Working | 10-class script classifier |
|
| 19 |
+
| Recognizers (02-10) | ✅ Working | Per-script CTC (CRNN-style), 9 scripts |
|
| 20 |
+
| Rejection models (11-21) | 🔓 Unlocked | 11 binary rejection classifiers — NOT integrated |
|
| 21 |
+
| Confidence models (22-32) | 🔓 Unlocked | 11 confidence calibrators — NOT integrated |
|
| 22 |
+
| LineLayout (model_33) | 🔓 Unlocked | Line segmentation model — minimally integrated |
|
| 23 |
+
| AuxMltCls (model_34) | 🔓 Unlocked | Script/handwriting classifier — NOT integrated |
|
| 24 |
+
| Protobuf config | ✅ Decoded | Full manifest with thresholds, calibrations |
|
| 25 |
+
| Score calibration files | ✅ Extracted | Platt scaling / temperature scaling params |
|
| 26 |
+
| DLL RE (reverse engineering) | ✅ Complete | ~600 C++ functions mapped, key classes identified |
|
| 27 |
+
|
| 28 |
+
### Gdzie jest luka (gap analysis)
|
| 29 |
+
|
| 30 |
+
Porównując output DLL vs ONNX pipeline na 19 obrazach testowych:
|
| 31 |
+
|
| 32 |
+
| Problem Category | Impact | Root Cause | Affected Images |
|
| 33 |
+
|---|---|---|---|
|
| 34 |
+
| **Missing detections** | ~25% of gap | PixelLink decoding + bbox_deltas regression | small text, dense text |
|
| 35 |
+
| **False positive detections** | ~15% of gap | No rejection model filtering | manga panels, backgrounds |
|
| 36 |
+
| **Wrong script routing** | ~10% of gap | AuxMltCls not used, ScriptID threshold tuning | CJK/handwritten |
|
| 37 |
+
| **Poor line grouping** | ~15% of gap | Heuristic Y-overlap instead of LineLayout model | multi-column, overlapping |
|
| 38 |
+
| **Crop quality** | ~15% of gap | Simplified padding/cropping vs DLL's adaptive | rotated/curved text |
|
| 39 |
+
| **Missing confidence filtering** | ~10% of gap | No confidence/rejection model cascade | noise, border artifacts |
|
| 40 |
+
| **Score calibration** | ~10% of gap | Raw scores used, no Platt/temperature scaling | threshold sensitivity |
|
| 41 |
+
|
| 42 |
+
### Kluczowe klasy DLL (z reverse engineering)
|
| 43 |
+
|
| 44 |
+
```
|
| 45 |
+
OneOCR::AVTextDetector → Detector + PixelLink + SeglinkProposals
|
| 46 |
+
OneOCR::AVBaseNormalizer → Adaptive text line normalization
|
| 47 |
+
OneOCR::AVTextLineRecognizer → CRNN recognition + rejection pipeline
|
| 48 |
+
OneOCR::AVConfidenceProto → Confidence model integration
|
| 49 |
+
OneOCR::AVRejectionProto → Rejection model cascade
|
| 50 |
+
OneOCR::AVLineLayoutClassifier → ML-based line segmentation (model 33)
|
| 51 |
+
OneOCR::AVAuxMltClsClassifier → Multi-script/handwriting classification
|
| 52 |
+
OneOCR::AVFontClassifier → Font type classification
|
| 53 |
+
OneOCR::AVPipeline → Orchestration + scheduling + batching
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## FAZA 1: Problem Statement
|
| 59 |
+
|
| 60 |
+
### Jedno zdanie
|
| 61 |
+
**Jak domknąć lukę jakości między naszym Python/ONNX pipeline (53% match) a DLL (100%) przy użyciu publicznie dostępnych badań, wyekstrahowanych modeli i danych konfiguracyjnych — bez dostępu do kodu źródłowego DLL?**
|
| 62 |
+
|
| 63 |
+
### Constraints
|
| 64 |
+
1. **Brak dostępu do źródeł DLL** — mamy tylko reverse engineering (demangled names, strings, constants)
|
| 65 |
+
2. **Modele 11-34 unlocked** — ale nie wiemy jak dokładnie DLL ich używa (kolejność, wejścia)
|
| 66 |
+
3. **Cross-platform** — rozwiązanie musi działać na Linux/macOS (nie tylko Windows)
|
| 67 |
+
4. **Performance budget** — max ~2× slower than current pipeline (131-1079ms per image)
|
| 68 |
+
5. **No retraining** — używamy istniejących modeli as-is
|
| 69 |
+
|
| 70 |
+
### Success Criteria
|
| 71 |
+
- ≥95% exact text match with DLL output on test set
|
| 72 |
+
- ≥90% bbox IoU match with DLL output
|
| 73 |
+
- Maintains cross-platform compatibility
|
| 74 |
+
- No dependency on proprietary code
|
| 75 |
+
|
| 76 |
+
---
|
| 77 |
+
|
| 78 |
+
## FAZA 2: Ideas Generation (20 ideas)
|
| 79 |
+
|
| 80 |
+
### A. Detection Improvements
|
| 81 |
+
|
| 82 |
+
| # | Idea | Effort | Expected Impact | Source |
|
| 83 |
+
|---|---|---|---|---|
|
| 84 |
+
| 1 | **Implement proper SegLink-style cross-layer linking** — add cross-scale segment connections between FPN2↔FPN3↔FPN4 like original SegLink paper (4 cross-links per node, stride 2× offset) | HIGH | +8-10% | SegLink paper (Shi 2017) |
|
| 85 |
+
| 2 | **Apply score calibration from chunk_34/35** — Parse `.calibration.txt` files and apply Platt scaling to pixel_scores before thresholding. Per-FPN-level calibration (manifest shows P2=0.7, P3=0.8, P4=0.8) | LOW | +5-7% | Manifest protobuf |
|
| 86 |
+
| 3 | **Use per-level thresholds from manifest** — Currently using flat 0.7; DLL uses P2=0.7, P3=0.8, P4=0.8 for pixel threshold, and 0.2 for NMS IoU | LOW | +3-5% | Manifest field 9 |
|
| 87 |
+
| 4 | **Implement oriented bbox regression** — Current code reduces 4 corners to axis-aligned rect. DLL's SeglinkProposals keeps oriented boxes via corner regression averaging per component | MED | +5-8% | PixelLink++ / RE analysis |
|
| 88 |
+
| 5 | **Add checkbox/special region detector** — Manifest references `checkbox_cal.txt`, DLL has `AVCheckboxDetectorProto` | LOW | +1-2% | Manifest field 22 |
|
| 89 |
+
|
| 90 |
+
### B. Recognition Pipeline
|
| 91 |
+
|
| 92 |
+
| # | Idea | Effort | Expected Impact | Source |
|
| 93 |
+
|---|---|---|---|---|
|
| 94 |
+
| 6 | **Integrate rejection models (11-21)** — Run binary classifier after CTC decode, filter false-positive recognitions using per-script thresholds from manifest (e.g. Latin=0.161/0.0881, CJK=0.2548) | MED | +8-12% | Manifest field 7, RE |
|
| 95 |
+
| 7 | **Integrate confidence models (22-32)** — Per-script confidence calibration with threshold=0.5 (manifest field 9, all scripts) | MED | +5-8% | Manifest field 9 |
|
| 96 |
+
| 8 | **Use AuxMltCls (model_34) for script routing** — Replace simple ScriptID with multi-class classifier including handwritten detection. Manifest shows thresholds: 4.1 (printed), -2.0 (handwritten) | MED | +5-7% | Manifest field 20 |
|
| 97 |
+
| 9 | **Apply composite_chars_map** — Manifest shows Cyrillic and Hebrew have `composite_chars_map` files for multi-character mappings | LOW | +2-3% | Manifest field 12 |
|
| 98 |
+
| 10 | **Implement adaptive CTC seq_lengths** — Use rnn_info files to set proper sequence lengths per script (different stride ratios) | LOW | +2-4% | rnn_info files |
|
| 99 |
+
|
| 100 |
+
### C. Line Layout & Grouping
|
| 101 |
+
|
| 102 |
+
| # | Idea | Effort | Expected Impact | Source |
|
| 103 |
+
|---|---|---|---|---|
|
| 104 |
+
| 11 | **Full LineLayout model integration (model_33)** — Replace Y-overlap heuristic with ML-based line boundary prediction. DLL manifest shows CJK config: line_gap=2.85, line_merge=3.1 | MED | +8-12% | Manifest field 13 |
|
| 105 |
+
| 12 | **Reading order estimation** — DLL has `AVPODReadingOrderProto`, implement Z-order / column detection | HIGH | +3-5% | DLL classes |
|
| 106 |
+
| 13 | **Region grouping** — DLL has `AVPODRegionGroupingProto` for multi-column layout detection | HIGH | +3-5% | DLL classes |
|
| 107 |
+
|
| 108 |
+
### D. Preprocessing & Normalization
|
| 109 |
+
|
| 110 |
+
| # | Idea | Effort | Expected Impact | Source |
|
| 111 |
+
|---|---|---|---|---|
|
| 112 |
+
| 14 | **Implement BaseNormalizer-style adaptive cropping** — DLL's `AVBaseNormalizer` dynamically adjusts padding based on text density and line height | MED | +5-7% | DLL class |
|
| 113 |
+
| 15 | **Proper rotation handling** — Use detector's vertical outputs (already extracted) instead of h>2w heuristic | LOW | +3-5% | Detector outputs |
|
| 114 |
+
| 16 | **Multi-scale detection** — Run detector at multiple scales (e.g. 600/800/1200 short side) and merge results | MED | +3-5% | FCOS/FPN literature |
|
| 115 |
+
|
| 116 |
+
### E. Post-Processing & Quality
|
| 117 |
+
|
| 118 |
+
| # | Idea | Effort | Expected Impact | Source |
|
| 119 |
+
|---|---|---|---|---|
|
| 120 |
+
| 17 | **Word-level confidence rejection** — Apply learned thresholds instead of hardcoded 0.3/0.35 | LOW | +3-5% | Manifest thresholds |
|
| 121 |
+
| 18 | **Batch recognizer inference** — Group crops by size, pad to same width, batch through ONNX | LOW (perf) | +0% (speed only) | DLL scheduling |
|
| 122 |
+
| 19 | **Implement TextlineBatcher** — DLL's `AVPipelineProto_TextlineImagesBatcher` groups textlines for efficient inference | LOW | +0% (speed only) | Manifest |
|
| 123 |
+
| 20 | **Score fusion** — Combine pixel scores, rejection model output, and confidence model into final weighted score | MED | +5-8% | Standard ensemble |
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
|
| 127 |
+
## FAZA 3: Evaluation Matrix
|
| 128 |
+
|
| 129 |
+
### Dimensions: Impact × Effort × Confidence (that it will work)
|
| 130 |
+
|
| 131 |
+
| # | Idea | Impact (1-5) | Effort (1-5, 1=easy) | Confidence (1-5) | Score (I×C/E) | Priority |
|
| 132 |
+
|---|---|---|---|---|---|---|
|
| 133 |
+
| 6 | Rejection models integration | 5 | 2 | 4 | 10.0 | ⭐ **#1** |
|
| 134 |
+
| 3 | Per-level thresholds from manifest | 4 | 1 | 5 | 20.0 | ⭐ **#2** |
|
| 135 |
+
| 2 | Score calibration (Platt scaling) | 4 | 1 | 4 | 16.0 | ⭐ **#3** |
|
| 136 |
+
| 7 | Confidence models integration | 4 | 2 | 4 | 8.0 | ⭐ **#4** |
|
| 137 |
+
| 11 | LineLayout model integration | 5 | 3 | 3 | 5.0 | ⭐ **#5** |
|
| 138 |
+
| 4 | Oriented bbox regression | 4 | 3 | 3 | 4.0 | #6 |
|
| 139 |
+
| 8 | AuxMltCls script routing | 4 | 2 | 3 | 6.0 | #7 |
|
| 140 |
+
| 14 | Adaptive cropping / normalization | 4 | 3 | 3 | 4.0 | #8 |
|
| 141 |
+
| 15 | Proper vertical text handling | 3 | 1 | 4 | 12.0 | #9 |
|
| 142 |
+
| 17 | Word-level confidence rejection | 3 | 1 | 4 | 12.0 | #10 |
|
| 143 |
+
| 1 | Cross-layer SegLink linking | 5 | 5 | 2 | 2.0 | #11 |
|
| 144 |
+
| 20 | Score fusion | 3 | 3 | 2 | 2.0 | #12 |
|
| 145 |
+
| 10 | Adaptive CTC seq_lengths | 2 | 1 | 3 | 6.0 | #13 |
|
| 146 |
+
| 9 | Composite chars map | 2 | 1 | 3 | 6.0 | #14 |
|
| 147 |
+
| 16 | Multi-scale detection | 3 | 3 | 2 | 2.0 | #15 |
|
| 148 |
+
| 5 | Checkbox detector | 1 | 1 | 3 | 3.0 | #16 |
|
| 149 |
+
| 12 | Reading order | 3 | 4 | 2 | 1.5 | #17 |
|
| 150 |
+
| 13 | Region grouping | 3 | 4 | 2 | 1.5 | #18 |
|
| 151 |
+
| 18 | Batch inference | 0 | 2 | 5 | 0.0 | — |
|
| 152 |
+
| 19 | TextlineBatcher | 0 | 2 | 5 | 0.0 | — |
|
| 153 |
+
|
| 154 |
+
### Decision Strategy: "Quick Wins First"
|
| 155 |
+
|
| 156 |
+
**Top 5 by ROI (Impact × Confidence / Effort):**
|
| 157 |
+
1. Per-level thresholds from manifest (Score: 20.0) — trivial change, immediate improvement
|
| 158 |
+
2. Score calibration with Platt scaling (Score: 16.0) — parse files, apply transform
|
| 159 |
+
3. Proper vertical text handling (Score: 12.0) — use detector vertical outputs
|
| 160 |
+
4. Word-level confidence from manifest (Score: 12.0) — swap hardcoded thresholds
|
| 161 |
+
5. Rejection models integration (Score: 10.0) — biggest single impact, moderate effort
|
| 162 |
+
|
| 163 |
+
**Top 5 by absolute Impact:**
|
| 164 |
+
1. Rejection models (Impact: 5) — filter ~12% false positives
|
| 165 |
+
2. LineLayout model (Impact: 5) — fix ~12% line grouping errors
|
| 166 |
+
3. Cross-layer SegLink (Impact: 5) — fix ~10% detection misses
|
| 167 |
+
4. Score calibration (Impact: 4) — stabilize ~7% threshold sensitivity
|
| 168 |
+
5. Confidence models (Impact: 4) — improve ~8% quality filtering
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
## FAZA 4: Deep Dive — Top 3 Approaches
|
| 173 |
+
|
| 174 |
+
### Approach 1: "Rejection + Confidence Model Cascade" (Ideas #6, #7, #20)
|
| 175 |
+
|
| 176 |
+
#### What
|
| 177 |
+
Integrate the 22 unlocked models (11 rejection + 11 confidence) into the recognition pipeline as post-CTC verification steps.
|
| 178 |
+
|
| 179 |
+
#### How (step-by-step)
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
CURRENT: Detect → Crop → ScriptID → CTC → Output
|
| 183 |
+
PROPOSED: Detect → Crop → ScriptID → CTC → Rejection → Confidence → Output
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
**Step 1: Determine rejection model inputs**
|
| 187 |
+
From DLL RE, rejection models are in `Model_Edge/Rejection/` with names like `LatinPrintedV2Dummy`. The manifest maps them by script:
|
| 188 |
+
|
| 189 |
+
| Script | Rejection Model | Threshold (field 3) | Alt Threshold (field 4) |
|
| 190 |
+
|---|---|---|---|
|
| 191 |
+
| Latin Printed V2 | model_11 (or 12?) | 0.3516 | 0.0552 |
|
| 192 |
+
| Latin Mixed V2 | model_13 (or 14?) | 0.161 | 0.0881 |
|
| 193 |
+
| CJK Printed | model_15? | 0.3136 | 0.3136 |
|
| 194 |
+
| CJK Mixed | model_16? | 0.2548 | 0.2548 |
|
| 195 |
+
| Arabic Mixed | model_17? | 0.2911 | 0.2911 |
|
| 196 |
+
| Cyrillic Mixed | model_18? | 0.2088 | 0.2088 |
|
| 197 |
+
| Devanagari Mixed | model_19? | 0.228 | 0.228 |
|
| 198 |
+
| Greek Mixed | model_20? | 0.3124 | 0.3124 |
|
| 199 |
+
| Hebrew Printed | model_21? | 0.1042 | 0.1042 |
|
| 200 |
+
| Tamil Printed | model_22? | 0.0443 | 0.0443 |
|
| 201 |
+
| Thai Mixed | model_23? | 0.3371 | 0.3371 |
|
| 202 |
+
|
| 203 |
+
**Step 2: Probe model inputs/outputs**
|
| 204 |
+
```python
|
| 205 |
+
sess = ort.InferenceSession("model_11_*.onnx")
|
| 206 |
+
for inp in sess.get_inputs():
|
| 207 |
+
print(inp.name, inp.shape, inp.type)
|
| 208 |
+
for out in sess.get_outputs():
|
| 209 |
+
print(out.name, out.shape, out.type)
|
| 210 |
+
```
|
| 211 |
+
Expected: input = CTC logprobs / hidden states + image features; output = probability (scalar)
|
| 212 |
+
|
| 213 |
+
**Step 3: Pipeline integration**
|
| 214 |
+
```python
|
| 215 |
+
def _recognize_with_rejection(self, crop, model_idx):
|
| 216 |
+
text, conf, char_confs = self._recognize(crop, model_idx)
|
| 217 |
+
|
| 218 |
+
# Rejection check
|
| 219 |
+
rejection_score = self._run_rejection(crop, model_idx, text)
|
| 220 |
+
threshold = REJECTION_THRESHOLDS[script_name]
|
| 221 |
+
if rejection_score < threshold:
|
| 222 |
+
return "", 0.0, [] # rejected as noise
|
| 223 |
+
|
| 224 |
+
# Confidence calibration
|
| 225 |
+
calibrated_conf = self._run_confidence(crop, model_idx, text)
|
| 226 |
+
if calibrated_conf < 0.5: # manifest threshold
|
| 227 |
+
return "", 0.0, [] # low confidence
|
| 228 |
+
|
| 229 |
+
return text, calibrated_conf, char_confs
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
#### Expected impact: +12-15% match rate (53% → ~65-68%)
|
| 233 |
+
#### Risk: Model input/output schema unknown — need probing
|
| 234 |
+
#### Effort: 2-3 days
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
### Approach 2: "Detection Calibration Sprint" (Ideas #2, #3, #4, #15)
|
| 239 |
+
|
| 240 |
+
#### What
|
| 241 |
+
Apply ALL detection-side improvements from the decoded manifest: per-level thresholds, score calibration, oriented bbox, vertical text handling.
|
| 242 |
+
|
| 243 |
+
#### How (step-by-step)
|
| 244 |
+
|
| 245 |
+
**Step 1: Per-level pixel thresholds (from manifest field 9)**
|
| 246 |
+
```python
|
| 247 |
+
# CURRENT (flat threshold):
|
| 248 |
+
_PIXEL_SCORE_THRESH = 0.7
|
| 249 |
+
|
| 250 |
+
# PROPOSED (per-level from manifest):
|
| 251 |
+
_PIXEL_THRESH_PER_LEVEL = {
|
| 252 |
+
"fpn2": 0.7, # Field 9, P2 = 0.7
|
| 253 |
+
"fpn3": 0.8, # Field 9, P3 = 0.8
|
| 254 |
+
"fpn4": 0.8, # Field 9, P4 = 0.8
|
| 255 |
+
}
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
**Step 2: NMS and other thresholds from manifest**
|
| 259 |
+
```python
|
| 260 |
+
# Manifest decoded values:
|
| 261 |
+
_NMS_IOU_THRESH = 0.2 # Field 10 = 0.2 ✅ (already correct)
|
| 262 |
+
_CROSS_LINK_SCORE = 0.4 # Field 11 = 0.4
|
| 263 |
+
_MIN_TEXTLINE_SCORE = 0.8 # Field 13 = 0.8
|
| 264 |
+
_VERTICAL_CONF = 0.3 # Field 15 = 0.3
|
| 265 |
+
_HORIZONTAL_CONF = 0.5 # Field 16 = 0.5
|
| 266 |
+
_LINK_MERGE_THRESH = 0.4 # Field 17 = 0.4
|
| 267 |
+
_MIN_DIM_RATIO = 0.32 # Field 20 = 0.32
|
| 268 |
+
_ASPECT_RATIO_THRESH = 0.3 # Field 21 = 0.3
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
**Step 3: Apply score calibration (chunk_34/35)**
|
| 272 |
+
```python
|
| 273 |
+
# Parse calibration file
|
| 274 |
+
def load_calibration(path):
|
| 275 |
+
"""Load Platt scaling params: P(text|s) = σ(A*s + B)"""
|
| 276 |
+
# Format TBD — likely A, B values per FPN level or per output
|
| 277 |
+
with open(path) as f:
|
| 278 |
+
params = parse_calibration(f.read())
|
| 279 |
+
return params
|
| 280 |
+
|
| 281 |
+
# Apply in pixellink_decode:
|
| 282 |
+
pixel_scores = platt_scale(raw_pixel_scores, calibration_params)
|
| 283 |
+
```
|
| 284 |
+
|
| 285 |
+
**Step 4: Proper oriented bbox from PixelLink deltas**
|
| 286 |
+
Instead of reducing to axis-aligned rect, compute proper rotated bbox using mean corner positions:
|
| 287 |
+
```python
|
| 288 |
+
# CURRENT: Takes min/max of corners → axis-aligned rect
|
| 289 |
+
# PROPOSED: For each component, average corner positions properly
|
| 290 |
+
for idx in indices:
|
| 291 |
+
tl_positions.append([tl_x, tl_y])
|
| 292 |
+
tr_positions.append([tr_x, tr_y])
|
| 293 |
+
br_positions.append([br_x, br_y])
|
| 294 |
+
bl_positions.append([bl_x, bl_y])
|
| 295 |
+
|
| 296 |
+
# Use extreme points along the principal axis
|
| 297 |
+
# TL = min along (x+y), TR = min along (-x+y), etc.
|
| 298 |
+
quad = compute_oriented_quad(tl_positions, tr_positions, br_positions, bl_positions)
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
**Step 5: Use vertical FPN outputs properly**
|
| 302 |
+
```python
|
| 303 |
+
# CURRENT: heuristic h > 2*w after detection
|
| 304 |
+
# PROPOSED: Use detector's vert outputs as primary for vertical text
|
| 305 |
+
for level, stride in [("fpn3", 8), ("fpn4", 16)]:
|
| 306 |
+
# Process horizontal AND vertical separately
|
| 307 |
+
hori_quads = decode(out_dict[f"scores_hori_{level}"], ...)
|
| 308 |
+
vert_quads = decode(out_dict[f"scores_vert_{level}"], ...)
|
| 309 |
+
# Tag vertical quads for rotation before recognition
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
#### Expected impact: +10-15% match rate (53% → ~63-68%)
|
| 313 |
+
#### Risk: Score calibration file format unknown
|
| 314 |
+
#### Effort: 2-3 days
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
### Approach 3: "LineLayout + Script Routing" (Ideas #8, #11)
|
| 319 |
+
|
| 320 |
+
#### What
|
| 321 |
+
Replace heuristic line grouping with ML model (model_33) and improve script routing with AuxMltCls (model_34).
|
| 322 |
+
|
| 323 |
+
#### How
|
| 324 |
+
|
| 325 |
+
**Step 1: Probe LineLayout model (model_33)**
|
| 326 |
+
```python
|
| 327 |
+
sess = ort.InferenceSession("model_33_*.onnx")
|
| 328 |
+
# Expected: input = text crop image; output = line boundary features
|
| 329 |
+
# DLL uses it for:
|
| 330 |
+
# - Deciding where to split/merge detected regions into lines
|
| 331 |
+
# - CJK specific: line_gap=2.85, line_merge=3.1 (manifest field 13)
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
**Step 2: AuxMltCls integration (model_34)**
|
| 335 |
+
From manifest (field 20):
|
| 336 |
+
- Purpose: Script + handwriting detection (multi-class)
|
| 337 |
+
- Thresholds: 4.1 (printed), -2.0 (handwritten), -5.0/5.0 (range)
|
| 338 |
+
- Per-script consecutive frames: Devanagari=2, Tamil=3, Thai=3, CJK=3, Cyrillic=1, Greek=3, Hebrew=3
|
| 339 |
+
- Handwritten calibration map file available
|
| 340 |
+
|
| 341 |
+
```python
|
| 342 |
+
def _classify_script_enhanced(self, crop):
|
| 343 |
+
# Step 1: Run AuxMltCls
|
| 344 |
+
aux_scores = self._run_aux_mlt_cls(crop)
|
| 345 |
+
|
| 346 |
+
# Step 2: Determine printed/handwritten
|
| 347 |
+
is_handwritten = aux_scores[handwriting_idx] > -2.0
|
| 348 |
+
|
| 349 |
+
# Step 3: Select per-script recognizer
|
| 350 |
+
# Use different model for handwritten CJK vs printed CJK
|
| 351 |
+
if is_handwritten and script == "CJK":
|
| 352 |
+
model_idx = CJK_MIXED_MODEL # instead of CJK_PRINTED
|
| 353 |
+
|
| 354 |
+
return model_idx
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
**Step 3: LineLayout model for line grouping**
|
| 358 |
+
```python
|
| 359 |
+
def _group_with_line_layout(self, quads, img_rgb):
|
| 360 |
+
sess = self._get_line_layout()
|
| 361 |
+
|
| 362 |
+
# For each pair of adjacent quads, predict if they belong to same line
|
| 363 |
+
for i, j in adjacent_pairs(quads):
|
| 364 |
+
# Crop region spanning both quads
|
| 365 |
+
combined_crop = self._crop_pair(img_rgb, quads[i], quads[j])
|
| 366 |
+
score = sess.run(None, {"data": preprocess(combined_crop)})[1]
|
| 367 |
+
|
| 368 |
+
if score > LINE_MERGE_THRESHOLD: # CJK: 2.85 → 3.1
|
| 369 |
+
merge(i, j)
|
| 370 |
+
|
| 371 |
+
return grouped_lines
|
| 372 |
+
```
|
| 373 |
+
|
| 374 |
+
#### Expected impact: +8-12% match rate (53% → ~61-65%)
|
| 375 |
+
#### Risk: Model I/O unknown, interpretation of scores TBD
|
| 376 |
+
#### Effort: 3-4 days
|
| 377 |
+
|
| 378 |
+
---
|
| 379 |
+
|
| 380 |
+
## FAZA 5: Good/Bad Analysis
|
| 381 |
+
|
| 382 |
+
### Approach 1: Rejection + Confidence Cascade
|
| 383 |
+
|
| 384 |
+
| ✅ Good | ❌ Bad |
|
| 385 |
+
|---|---|
|
| 386 |
+
| Biggest single impact area (~12-15%) | Need to discover model input schema |
|
| 387 |
+
| Models already unlocked and ready | Script-to-model mapping uncertain (need probing) |
|
| 388 |
+
| Clear thresholds from manifest | May need recognizer hidden states (not just CTC output) |
|
| 389 |
+
| Standard pattern in OCR literature | 22 additional models = memory overhead |
|
| 390 |
+
| Directly addresses false-positive problem | Two thresholds (rejection + confidence) may interact poorly |
|
| 391 |
+
|
| 392 |
+
### Approach 2: Detection Calibration Sprint
|
| 393 |
+
|
| 394 |
+
| ✅ Good | ❌ Bad |
|
| 395 |
+
|---|---|
|
| 396 |
+
| Several trivial "flip the switch" changes | Score calibration file format unknown |
|
| 397 |
+
| Data directly from decoded manifest | Oriented bbox changes may break downstream crop |
|
| 398 |
+
| Low risk — incremental, testable change | Per-level thresholds may not compose well |
|
| 399 |
+
| Addresses root cause (detection quality) | Vertical text handling rearchitecture needed |
|
| 400 |
+
| Well-documented in papers (PixelLink, SegLink) | DLL may use SegLink (not PixelLink) internally |
|
| 401 |
+
|
| 402 |
+
### Approach 3: LineLayout + Script Routing
|
| 403 |
+
|
| 404 |
+
| ✅ Good | ❌ Bad |
|
| 405 |
+
|---|---|
|
| 406 |
+
| ML-based replaces heuristic — expected to be better | LineLayout model I/O completely unknown |
|
| 407 |
+
| AuxMltCls has clear manifest config | Handwritten vs printed routing adds complexity |
|
| 408 |
+
| Fixes systematic line-grouping errors | Only benefits multi-line images |
|
| 409 |
+
| CJK-specific params available | DLL may pass internal state to LineLayout, not image |
|
| 410 |
+
| Enables proper reading order | Highest effort of the 3 approaches |
|
| 411 |
+
|
| 412 |
+
---
|
| 413 |
+
|
| 414 |
+
## FAZA 6: Final Recommendation
|
| 415 |
+
|
| 416 |
+
### Recommended Strategy: "Three Sprints" (all three approaches, prioritized)
|
| 417 |
+
|
| 418 |
+
**Combining all three approaches is NECESSARY to reach ≥95%.** No single approach alone
|
| 419 |
+
can close the 47-point gap. The approaches are orthogonal and composable.
|
| 420 |
+
|
| 421 |
+
### Execution Plan
|
| 422 |
+
|
| 423 |
+
#### Sprint 1 — "Quick Wins" (1-2 days) → Expected: 53% → ~65%
|
| 424 |
+
1. ✅ Apply per-level thresholds from manifest (P2=0.7, P3=0.8, P4=0.8)
|
| 425 |
+
2. ✅ Fix NMS and linking thresholds from manifest (cross_link=0.4, min_textline=0.8)
|
| 426 |
+
3. ✅ Use manifest confidence thresholds instead of hardcoded 0.3/0.35
|
| 427 |
+
4. ✅ Use detector vertical outputs properly (already in output dict)
|
| 428 |
+
5. ✅ Parse and apply score calibration files (chunk_34/35)
|
| 429 |
+
|
| 430 |
+
#### Sprint 2 — "Rejection Pipeline" (2-3 days) → Expected: ~65% → ~78%
|
| 431 |
+
1. Probe all models 11-21 (rejection) — determine input/output schema
|
| 432 |
+
2. Map rejection models to scripts using manifest field 7
|
| 433 |
+
3. Implement rejection cascade after CTC decode
|
| 434 |
+
4. Probe all models 22-32 (confidence) — determine input/output schema
|
| 435 |
+
5. Implement confidence calibration after rejection
|
| 436 |
+
6. Apply composite_chars_map for Cyrillic and Hebrew
|
| 437 |
+
|
| 438 |
+
#### Sprint 3 — "ML Line Grouping" (3-4 days) → Expected: ~78% → ~90%
|
| 439 |
+
1. Probe LineLayout model (model_33) — determine I/O schema
|
| 440 |
+
2. Implement ML-based line grouping using model_33
|
| 441 |
+
3. Probe AuxMltCls model (model_34) — determine I/O schema
|
| 442 |
+
4. Implement enhanced script routing with handwritten detection
|
| 443 |
+
5. Implement proper oriented bbox from corner regression
|
| 444 |
+
6. Cross-layer FPN linking (SegLink-style, if time permits)
|
| 445 |
+
|
| 446 |
+
### Post-Sprint: "Fine-Tuning" → Expected: ~90% → ~95%+
|
| 447 |
+
- A/B test every threshold against DLL output
|
| 448 |
+
- Implement SegLink cross-layer linking if detection gaps remain
|
| 449 |
+
- Reading order optimization for complex layouts
|
| 450 |
+
- Performance optimization (batching, session caching)
|
| 451 |
+
|
| 452 |
+
### Key Research References
|
| 453 |
+
|
| 454 |
+
1. **SegLink** — Shi et al. (2017), CVPR, arXiv:1703.06520
|
| 455 |
+
- Segments + Links + Cross-layer connections → combined oriented boxes
|
| 456 |
+
- Post-processing: DFS connected components → linear regression merge
|
| 457 |
+
|
| 458 |
+
2. **PixelLink** — Deng et al. (2018), AAAI, arXiv:1801.01315
|
| 459 |
+
- Pixel classification + 8-neighbor link → Union-Find → minAreaRect
|
| 460 |
+
- NO regression — pure segmentation approach
|
| 461 |
+
|
| 462 |
+
3. **CRNN** — Shi et al. (2015), arXiv:1507.05717
|
| 463 |
+
- CNN + BiLSTM + CTC — foundation of OneOCR's recognizers (models 2-10)
|
| 464 |
+
|
| 465 |
+
4. **On Calibration of Modern Neural Networks** — Guo et al. (2017), ICML
|
| 466 |
+
- Temperature scaling, Platt scaling for confidence calibration
|
| 467 |
+
|
| 468 |
+
5. **OneOCR Manifest** — Decoded protobuf (internal, extracted from `.onemodel`)
|
| 469 |
+
- Complete config: thresholds per FPN level, rejection/confidence thresholds, CJK line layout params
|
| 470 |
+
|
| 471 |
+
### Risk Assessment
|
| 472 |
+
|
| 473 |
+
| Risk | Probability | Impact | Mitigation |
|
| 474 |
+
|---|---|---|---|
|
| 475 |
+
| Rejection model inputs incompatible | Medium | High | Probe models first, fall back to confidence-only |
|
| 476 |
+
| Score calibration format unreadable | Low | Medium | Try common formats (CSV, binary float, protobuf) |
|
| 477 |
+
| LineLayout needs DLL internal state | Medium | High | Fall back to improved heuristic with ML scoring |
|
| 478 |
+
| Cross-layer SegLink too complex | High | Medium | Skip if Quick Wins + Rejection get us to ~80% |
|
| 479 |
+
| Models 11-34 need features not in ONNX | Low | High | Those features ARE in ONNX outputs (just need mapping) |
|
| 480 |
+
|
| 481 |
+
### Honest Assessment
|
| 482 |
+
|
| 483 |
+
**Can we reach 100%?** Probably not without the exact DLL source code.
|
| 484 |
+
|
| 485 |
+
**Can we reach 95%?** YES — with all three sprints executed, followed by post-sprint fine-tuning. The gap is primarily from:
|
| 486 |
+
- Missing rejection filtering (easily fixed with unlocked models)
|
| 487 |
+
- Wrong detection thresholds (trivially fixed from manifest)
|
| 488 |
+
- Heuristic line grouping (fixable with model 33)
|
| 489 |
+
|
| 490 |
+
**What blocks us?** The biggest unknown is the rejection model I/O schema. If those models expect
|
| 491 |
+
internal DLL tensor states that we can't provide, we'll plateau around 75-80%.
|
| 492 |
+
|
| 493 |
+
**Is the publicly available research sufficient?** YES for the algorithmic concepts (PixelLink,
|
| 494 |
+
SegLink, CRNN, score calibration). The extracted manifest + config files fill in the
|
| 495 |
+
implementation-specific gaps (thresholds, calibration params, model routing).
|
| 496 |
+
|
| 497 |
+
---
|
| 498 |
+
|
| 499 |
+
## Manifest Threshold Quick Reference
|
| 500 |
+
|
| 501 |
+
```
|
| 502 |
+
DETECTOR:
|
| 503 |
+
pixel_threshold_global: 0.7 (field 8)
|
| 504 |
+
pixel_threshold_P2: 0.7 (field 9[0])
|
| 505 |
+
pixel_threshold_P3: 0.8 (field 9[1])
|
| 506 |
+
pixel_threshold_P4: 0.8 (field 9[2])
|
| 507 |
+
nms_iou_threshold: 0.2 (field 10)
|
| 508 |
+
cross_link_score: 0.4 (field 11)
|
| 509 |
+
min_textline_score: 0.8 (field 13)
|
| 510 |
+
vertical_conf: 0.3 (field 15)
|
| 511 |
+
horizontal_conf: 0.5 (field 16)
|
| 512 |
+
link_merge_threshold: 0.4 (field 17)
|
| 513 |
+
min_dim_ratio: 0.32 (field 20)
|
| 514 |
+
aspect_ratio_thresh: 0.3 (field 21)
|
| 515 |
+
|
| 516 |
+
REJECTION (per script):
|
| 517 |
+
LatinPrintedV2: 0.3516 / 0.0552
|
| 518 |
+
LatinMixedV2: 0.161 / 0.0881
|
| 519 |
+
CJKPrinted: 0.3136
|
| 520 |
+
CJKMixed: 0.2548
|
| 521 |
+
ArabicMixed: 0.2911
|
| 522 |
+
CyrillicMixed: 0.2088
|
| 523 |
+
DevanagariMixed: 0.228
|
| 524 |
+
GreekMixed: 0.3124
|
| 525 |
+
HebrewPrinted: 0.1042
|
| 526 |
+
TamilPrinted: 0.0443
|
| 527 |
+
ThaiMixed: 0.3371
|
| 528 |
+
|
| 529 |
+
CONFIDENCE (all scripts):
|
| 530 |
+
threshold: 0.5
|
| 531 |
+
|
| 532 |
+
LINELAYOUT (CJK):
|
| 533 |
+
line_gap: 2.85
|
| 534 |
+
line_merge: 3.1
|
| 535 |
+
|
| 536 |
+
AUXMLTCLS:
|
| 537 |
+
printed_threshold: 4.1
|
| 538 |
+
handwritten_threshold: -2.0
|
| 539 |
+
score_range: [-5.0, 5.0]
|
| 540 |
+
```
|
BRAINSTORM_ONEOCR_ACCURACY_SUMMARY.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BRAINSTORM SUMMARY: OneOCR Accuracy Gap (53% → 95%+)
|
| 2 |
+
|
| 3 |
+
## Problem
|
| 4 |
+
ONNX pipeline matches DLL output on only 53% of test images. Root causes:
|
| 5 |
+
no rejection filtering, flat detection thresholds, heuristic line grouping.
|
| 6 |
+
|
| 7 |
+
## Top 3 Approaches (in order)
|
| 8 |
+
|
| 9 |
+
### 1. Quick Wins — Detection Calibration (1-2 days, +12%)
|
| 10 |
+
- Apply per-FPN-level pixel thresholds from manifest (P2=0.7, P3=0.8, P4=0.8)
|
| 11 |
+
- Use manifest NMS/linking thresholds instead of hardcoded values
|
| 12 |
+
- Parse score calibration files (chunk_34/35) → Platt scaling
|
| 13 |
+
- Use vertical FPN outputs instead of h>2w heuristic
|
| 14 |
+
- Apply manifest confidence thresholds
|
| 15 |
+
|
| 16 |
+
### 2. Rejection + Confidence Pipeline (2-3 days, +13%)
|
| 17 |
+
- Integrate 11 rejection models (11-21) after CTC decode
|
| 18 |
+
- Per-script rejection thresholds from manifest (e.g. Latin=0.161, CJK=0.2548)
|
| 19 |
+
- Integrate 11 confidence models (22-32) with threshold=0.5
|
| 20 |
+
- Apply composite_chars_map for Cyrillic/Hebrew
|
| 21 |
+
- This alone addresses ~80% of the false-positive gap
|
| 22 |
+
|
| 23 |
+
### 3. ML Line Grouping + Script Routing (3-4 days, +12%)
|
| 24 |
+
- Replace Y-overlap heuristic with LineLayout model (model_33)
|
| 25 |
+
- Use AuxMltCls model (model_34) for printed/handwritten routing
|
| 26 |
+
- Implement proper oriented bbox from corner regression
|
| 27 |
+
- CJK-specific: line_gap=2.85, line_merge=3.1
|
| 28 |
+
|
| 29 |
+
## Expected Trajectory
|
| 30 |
+
```
|
| 31 |
+
Sprint 1 (Quick Wins): 53% → ~65%
|
| 32 |
+
Sprint 2 (Rejection): 65% → ~78%
|
| 33 |
+
Sprint 3 (Line Layout): 78% → ~90%
|
| 34 |
+
Fine-tuning: 90% → ~95%+
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Key Risk
|
| 38 |
+
Rejection models (11-21) may expect internal DLL tensor states we can't provide.
|
| 39 |
+
**Mitigation:** Probe model inputs first — most likely they take image crop + CTC logprobs.
|
| 40 |
+
|
| 41 |
+
## Critical Data Sources
|
| 42 |
+
- Manifest protobuf (`15_manifest_decoded.txt`) — ALL thresholds
|
| 43 |
+
- Score calibration files (`chunk_34/35`) — Platt scaling
|
| 44 |
+
- SegLink paper (Shi 2017) — cross-layer linking algorithm
|
| 45 |
+
- PixelLink paper (Deng 2018) — Union-Find decoder reference
|
| 46 |
+
|
| 47 |
+
## Bottom Line
|
| 48 |
+
**95% is achievable** with ~7-9 days of focused work across 3 sprints.
|
| 49 |
+
100% match unlikely without DLL source code, but remaining gap would be
|
| 50 |
+
edge cases (curved text, exotic layouts).
|
Dockerfile
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─────────────────────────────────────────────────────
|
| 2 |
+
# OneOCR on Linux — Dockerfile
|
| 3 |
+
#
|
| 4 |
+
# Uses Wine to run the native Windows DLL on Linux.
|
| 5 |
+
# Result: 100% accuracy (identical to Windows DLL).
|
| 6 |
+
#
|
| 7 |
+
# Build:
|
| 8 |
+
# docker build -t oneocr .
|
| 9 |
+
#
|
| 10 |
+
# Run OCR on a single image:
|
| 11 |
+
# docker run --rm -v $(pwd)/working_space:/data oneocr \
|
| 12 |
+
# python main.py /data/input/test.png --output /data/output/result.json
|
| 13 |
+
#
|
| 14 |
+
# Interactive:
|
| 15 |
+
# docker run --rm -it -v $(pwd)/working_space:/data oneocr bash
|
| 16 |
+
# ─────────────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
FROM ubuntu:24.04
|
| 19 |
+
|
| 20 |
+
LABEL maintainer="MattyMroz"
|
| 21 |
+
LABEL description="OneOCR — Windows DLL on Linux via Wine (100% accuracy)"
|
| 22 |
+
|
| 23 |
+
# Avoid interactive prompts
|
| 24 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 25 |
+
ENV WINEDEBUG=-all
|
| 26 |
+
|
| 27 |
+
# ── 1. Install Wine + MinGW cross-compiler ─────────
|
| 28 |
+
RUN dpkg --add-architecture amd64 && \
|
| 29 |
+
apt-get update && \
|
| 30 |
+
apt-get install -y --no-install-recommends \
|
| 31 |
+
wine64 \
|
| 32 |
+
wine \
|
| 33 |
+
mingw-w64 \
|
| 34 |
+
python3 \
|
| 35 |
+
python3-pip \
|
| 36 |
+
python3-venv \
|
| 37 |
+
python3-dev \
|
| 38 |
+
&& \
|
| 39 |
+
apt-get clean && \
|
| 40 |
+
rm -rf /var/lib/apt/lists/*
|
| 41 |
+
|
| 42 |
+
# ── 2. Initialize Wine prefix (64-bit) ────────────
|
| 43 |
+
RUN WINEPREFIX=/root/.wine WINEARCH=win64 wineboot --init 2>/dev/null; \
|
| 44 |
+
sleep 2
|
| 45 |
+
|
| 46 |
+
# ── 3. Copy project ───────────────────────────────
|
| 47 |
+
WORKDIR /app
|
| 48 |
+
COPY . /app/
|
| 49 |
+
|
| 50 |
+
# ── 4. Install Python dependencies ────────────────
|
| 51 |
+
RUN python3 -m venv /app/.venv && \
|
| 52 |
+
/app/.venv/bin/pip install --no-cache-dir \
|
| 53 |
+
pillow \
|
| 54 |
+
numpy \
|
| 55 |
+
onnxruntime
|
| 56 |
+
|
| 57 |
+
# ── 5. Cross-compile Wine loader ──────────────────
|
| 58 |
+
RUN x86_64-w64-mingw32-gcc -O2 \
|
| 59 |
+
-o /app/tools/oneocr_loader.exe \
|
| 60 |
+
/app/tools/oneocr_loader.c \
|
| 61 |
+
|| echo "Will compile on first run"
|
| 62 |
+
|
| 63 |
+
# ── 6. Write the C source for compilation ─────────
|
| 64 |
+
RUN /app/.venv/bin/python -c "\
|
| 65 |
+
from tools.wine_bridge import WINE_LOADER_C; \
|
| 66 |
+
from pathlib import Path; \
|
| 67 |
+
Path('/app/tools/oneocr_loader.c').write_text(WINE_LOADER_C)" && \
|
| 68 |
+
x86_64-w64-mingw32-gcc -O2 \
|
| 69 |
+
-o /app/tools/oneocr_loader.exe \
|
| 70 |
+
/app/tools/oneocr_loader.c \
|
| 71 |
+
2>/dev/null || true
|
| 72 |
+
|
| 73 |
+
# ── 7. Environment ────────────────────────────────
|
| 74 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 75 |
+
ENV PYTHONPATH="/app"
|
| 76 |
+
|
| 77 |
+
# ── 8. Healthcheck ────────────────────────────────
|
| 78 |
+
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
|
| 79 |
+
CMD python3 -c "from tools.wine_bridge import WineBridge; \
|
| 80 |
+
b = WineBridge(); c = b.check_requirements(); \
|
| 81 |
+
exit(0 if c.get('wine_found') else 1)"
|
| 82 |
+
|
| 83 |
+
# ── Default command ───────────────────────────────
|
| 84 |
+
CMD ["python3", "main.py", "--help"]
|
README.md
CHANGED
|
@@ -13,7 +13,8 @@ Full reimplementation of Microsoft's OneOCR engine from Windows Snipping Tool.
|
|
| 13 |
| **Model extraction** | ✅ Done | 34 ONNX models, 33 config files |
|
| 14 |
| **Custom op unlocking** | ✅ Done | `OneOCRFeatureExtract` → `Gemm`/`Conv1x1` |
|
| 15 |
| **ONNX pipeline** | ⚠️ Partial | **53% match rate** vs DLL (10/19 test images) |
|
| 16 |
-
| **DLL pipeline** | ✅ Done | ctypes wrapper,
|
|
|
|
| 17 |
|
| 18 |
### Known ONNX Engine Limitations
|
| 19 |
|
|
@@ -122,15 +123,16 @@ python tools/extract_pipeline.py --verify-only
|
|
| 122 |
### Usage
|
| 123 |
|
| 124 |
```python
|
| 125 |
-
|
|
|
|
| 126 |
from PIL import Image
|
| 127 |
|
| 128 |
-
engine =
|
| 129 |
result = engine.recognize_pil(Image.open("screenshot.png"))
|
| 130 |
|
| 131 |
-
print(
|
| 132 |
-
print(result.
|
| 133 |
-
print(result.
|
| 134 |
|
| 135 |
for line in result.lines:
|
| 136 |
for word in line.words:
|
|
@@ -138,6 +140,26 @@ for line in result.lines:
|
|
| 138 |
f"bbox=({word.bounding_rect.x1:.0f},{word.bounding_rect.y1:.0f})")
|
| 139 |
```
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
### API Reference
|
| 142 |
|
| 143 |
```python
|
|
@@ -167,25 +189,90 @@ word.bounding_rect # BoundingRect (x1,y1...x4,y4 quadrilateral)
|
|
| 167 |
|
| 168 |
---
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
## Project Structure
|
| 171 |
|
| 172 |
```
|
| 173 |
ONEOCR/
|
| 174 |
-
├── main.py #
|
|
|
|
| 175 |
├── pyproject.toml # Project config & dependencies
|
| 176 |
├── README.md # This documentation
|
| 177 |
├── .gitignore
|
| 178 |
│
|
| 179 |
├── ocr/ # Core OCR package
|
| 180 |
-
│ ├── __init__.py # Exports
|
| 181 |
│ ├── engine.py # DLL wrapper (Windows only, 374 lines)
|
| 182 |
│ ├── engine_onnx.py # ONNX engine (cross-platform, ~1100 lines)
|
|
|
|
| 183 |
│ └── models.py # Data models: OcrResult, OcrLine, OcrWord
|
| 184 |
│
|
| 185 |
├── tools/ # Utilities
|
| 186 |
│ ├── extract_pipeline.py # Extraction pipeline (decrypt→extract→unlock→verify)
|
| 187 |
│ ├── visualize_ocr.py # OCR result visualization with bounding boxes
|
| 188 |
-
│
|
|
|
|
|
|
|
| 189 |
│
|
| 190 |
├── ocr_data/ # Runtime data (DO NOT commit)
|
| 191 |
│ ├── oneocr.dll # Original DLL (Windows only)
|
|
|
|
| 13 |
| **Model extraction** | ✅ Done | 34 ONNX models, 33 config files |
|
| 14 |
| **Custom op unlocking** | ✅ Done | `OneOCRFeatureExtract` → `Gemm`/`Conv1x1` |
|
| 15 |
| **ONNX pipeline** | ⚠️ Partial | **53% match rate** vs DLL (10/19 test images) |
|
| 16 |
+
| **DLL pipeline (Windows)** | ✅ Done | ctypes wrapper, 100% accuracy |
|
| 17 |
+
| **DLL pipeline (Linux)** | ✅ Done | Wine bridge, 100% accuracy, Docker ready |
|
| 18 |
|
| 19 |
### Known ONNX Engine Limitations
|
| 20 |
|
|
|
|
| 123 |
### Usage
|
| 124 |
|
| 125 |
```python
|
| 126 |
+
# Recommended: Unified engine (auto-selects best backend)
|
| 127 |
+
from ocr.engine_unified import OcrEngineUnified
|
| 128 |
from PIL import Image
|
| 129 |
|
| 130 |
+
engine = OcrEngineUnified() # auto: DLL → Wine → ONNX
|
| 131 |
result = engine.recognize_pil(Image.open("screenshot.png"))
|
| 132 |
|
| 133 |
+
print(f"Backend: {engine.backend_name}") # "dll" / "wine" / "onnx"
|
| 134 |
+
print(result.text) # "Hello World"
|
| 135 |
+
print(result.average_confidence) # 0.975
|
| 136 |
|
| 137 |
for line in result.lines:
|
| 138 |
for word in line.words:
|
|
|
|
| 140 |
f"bbox=({word.bounding_rect.x1:.0f},{word.bounding_rect.y1:.0f})")
|
| 141 |
```
|
| 142 |
|
| 143 |
+
```bash
|
| 144 |
+
# CLI:
|
| 145 |
+
python main.py screenshot.png # auto backend
|
| 146 |
+
python main.py screenshot.png --backend dll # force DLL (Windows)
|
| 147 |
+
python main.py screenshot.png --backend wine # force Wine (Linux)
|
| 148 |
+
python main.py screenshot.png --backend onnx # force ONNX (any OS)
|
| 149 |
+
python main.py screenshot.png -o result.json # save JSON output
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### ONNX Engine (alternative — cross-platform, no Wine needed)
|
| 153 |
+
|
| 154 |
+
```python
|
| 155 |
+
from ocr.engine_onnx import OcrEngineOnnx
|
| 156 |
+
from PIL import Image
|
| 157 |
+
|
| 158 |
+
engine = OcrEngineOnnx()
|
| 159 |
+
result = engine.recognize_pil(Image.open("screenshot.png"))
|
| 160 |
+
print(result.text)
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
### API Reference
|
| 164 |
|
| 165 |
```python
|
|
|
|
| 189 |
|
| 190 |
---
|
| 191 |
|
| 192 |
+
## Running on Linux (Wine Bridge — 100% accuracy)
|
| 193 |
+
|
| 194 |
+
The DLL has a remarkably clean dependency profile (only `KERNEL32`, `bcrypt`, `dbghelp` + shipped `onnxruntime.dll`), making it fully compatible with Wine.
|
| 195 |
+
|
| 196 |
+
### Option A: Docker (recommended)
|
| 197 |
+
|
| 198 |
+
```bash
|
| 199 |
+
# Build
|
| 200 |
+
docker build -t oneocr .
|
| 201 |
+
|
| 202 |
+
# Run OCR on an image
|
| 203 |
+
docker run --rm -v $(pwd)/working_space:/data oneocr \
|
| 204 |
+
python main.py /data/input/test.png --output /data/output/result.json
|
| 205 |
+
|
| 206 |
+
# Interactive shell
|
| 207 |
+
docker run --rm -it -v $(pwd)/working_space:/data oneocr bash
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
### Option B: Native Wine
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
# 1. Install Wine + MinGW cross-compiler
|
| 214 |
+
# Ubuntu/Debian:
|
| 215 |
+
sudo apt install wine64 mingw-w64
|
| 216 |
+
|
| 217 |
+
# Fedora:
|
| 218 |
+
sudo dnf install wine mingw64-gcc
|
| 219 |
+
|
| 220 |
+
# Arch:
|
| 221 |
+
sudo pacman -S wine mingw-w64-gcc
|
| 222 |
+
|
| 223 |
+
# 2. Initialize 64-bit Wine prefix
|
| 224 |
+
WINEARCH=win64 wineboot --init
|
| 225 |
+
|
| 226 |
+
# 3. Compile the Wine loader (one-time)
|
| 227 |
+
x86_64-w64-mingw32-gcc -O2 -o tools/oneocr_loader.exe tools/oneocr_loader.c
|
| 228 |
+
|
| 229 |
+
# 4. Test
|
| 230 |
+
python main.py screenshot.png --backend wine
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Wine Bridge Architecture
|
| 234 |
+
|
| 235 |
+
```
|
| 236 |
+
Linux Python ──► subprocess (wine64) ──► oneocr_loader.exe ──► oneocr.dll
|
| 237 |
+
▲ │
|
| 238 |
+
│ ▼
|
| 239 |
+
└──── JSON stdout ◄──── OCR results ◄──── onnxruntime.dll
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
**DLL Dependencies (all implemented in Wine ≥ 8.0):**
|
| 243 |
+
|
| 244 |
+
| DLL | Functions | Wine Status | Notes |
|
| 245 |
+
|-----|-----------|-------------|-------|
|
| 246 |
+
| `KERNEL32.dll` | 183 | ✅ Full | Standard WinAPI |
|
| 247 |
+
| `bcrypt.dll` | 12 | ✅ Full | AES-256-CFB128 for model decryption |
|
| 248 |
+
| `dbghelp.dll` | 5 | ✅ Stubs | Debug symbols — non-critical |
|
| 249 |
+
| `onnxruntime.dll` | 1 | N/A | Shipped with package |
|
| 250 |
+
|
| 251 |
+
---
|
| 252 |
+
|
| 253 |
## Project Structure
|
| 254 |
|
| 255 |
```
|
| 256 |
ONEOCR/
|
| 257 |
+
├── main.py # CLI entry point (auto-selects backend)
|
| 258 |
+
├── Dockerfile # Docker setup for Linux (Wine + DLL)
|
| 259 |
├── pyproject.toml # Project config & dependencies
|
| 260 |
├── README.md # This documentation
|
| 261 |
├── .gitignore
|
| 262 |
│
|
| 263 |
├── ocr/ # Core OCR package
|
| 264 |
+
│ ├── __init__.py # Exports all engines & models
|
| 265 |
│ ├── engine.py # DLL wrapper (Windows only, 374 lines)
|
| 266 |
│ ├── engine_onnx.py # ONNX engine (cross-platform, ~1100 lines)
|
| 267 |
+
│ ├── engine_unified.py # Unified wrapper (DLL → Wine → ONNX)
|
| 268 |
│ └── models.py # Data models: OcrResult, OcrLine, OcrWord
|
| 269 |
│
|
| 270 |
├── tools/ # Utilities
|
| 271 |
│ ├── extract_pipeline.py # Extraction pipeline (decrypt→extract→unlock→verify)
|
| 272 |
│ ├── visualize_ocr.py # OCR result visualization with bounding boxes
|
| 273 |
+
│ ├── test_quick.py # Quick OCR test on images
|
| 274 |
+
│ ├── wine_bridge.py # Wine bridge for Linux (C loader + Python API)
|
| 275 |
+
│ └── oneocr_loader.c # C source for Wine loader (auto-generated)
|
| 276 |
│
|
| 277 |
├── ocr_data/ # Runtime data (DO NOT commit)
|
| 278 |
│ ├── oneocr.dll # Original DLL (Windows only)
|
main.py
CHANGED
|
@@ -1,65 +1,82 @@
|
|
| 1 |
"""
|
| 2 |
-
OneOCR — Cross-platform OCR using
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
| 7 |
|
| 8 |
Usage:
|
| 9 |
python main.py <image_path>
|
| 10 |
python main.py # uses test.png
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
|
|
|
|
| 13 |
import sys
|
| 14 |
from pathlib import Path
|
| 15 |
from PIL import Image
|
| 16 |
|
| 17 |
|
| 18 |
def main():
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
if not Path(
|
| 22 |
-
print(f"Image not found: {
|
| 23 |
print(f"Usage: python main.py <image_path>")
|
| 24 |
sys.exit(1)
|
| 25 |
|
| 26 |
-
img = Image.open(
|
| 27 |
-
print(f"Image: {
|
| 28 |
print()
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
-
from ocr.engine_onnx import OcrEngineOnnx
|
| 33 |
-
engine = OcrEngineOnnx()
|
| 34 |
-
result = engine.recognize_pil(img)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
print(f"Angle: {result.text_angle:.1f}")
|
| 40 |
-
print()
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
print(f"ONNX engine error: {e}")
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
|
|
|
| 1 |
"""
|
| 2 |
+
OneOCR — Cross-platform OCR using Microsoft OneOCR engine.
|
| 3 |
|
| 4 |
+
Available backends (auto-selected):
|
| 5 |
+
1. OcrEngine: Windows-only DLL wrapper (100% accuracy, fastest)
|
| 6 |
+
2. OcrEngineUnified: Auto-selects best backend (DLL → Wine → ONNX)
|
| 7 |
+
3. OcrEngineOnnx: Cross-platform ONNX reimplementation (~53% match rate)
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python main.py <image_path>
|
| 11 |
python main.py # uses test.png
|
| 12 |
+
python main.py --backend dll # force DLL backend
|
| 13 |
+
python main.py --backend wine # force Wine backend (Linux)
|
| 14 |
+
python main.py --backend onnx # force ONNX backend
|
| 15 |
"""
|
| 16 |
|
| 17 |
+
import argparse
|
| 18 |
import sys
|
| 19 |
from pathlib import Path
|
| 20 |
from PIL import Image
|
| 21 |
|
| 22 |
|
| 23 |
def main():
|
| 24 |
+
parser = argparse.ArgumentParser(description="OneOCR — Cross-platform OCR")
|
| 25 |
+
parser.add_argument("image", nargs="?", default="test.png", help="Image path")
|
| 26 |
+
parser.add_argument("--backend", "-b", choices=["dll", "wine", "onnx", "auto"],
|
| 27 |
+
default="auto", help="OCR backend (default: auto)")
|
| 28 |
+
parser.add_argument("--output", "-o", help="Save results to JSON file")
|
| 29 |
+
args = parser.parse_args()
|
| 30 |
|
| 31 |
+
if not Path(args.image).exists():
|
| 32 |
+
print(f"Image not found: {args.image}")
|
| 33 |
print(f"Usage: python main.py <image_path>")
|
| 34 |
sys.exit(1)
|
| 35 |
|
| 36 |
+
img = Image.open(args.image)
|
| 37 |
+
print(f"Image: {args.image} ({img.size[0]}x{img.size[1]})")
|
| 38 |
print()
|
| 39 |
|
| 40 |
+
# Use unified engine (auto-selects best backend)
|
| 41 |
+
from ocr.engine_unified import OcrEngineUnified
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
force = args.backend if args.backend != "auto" else None
|
| 44 |
+
engine = OcrEngineUnified(force_backend=force)
|
| 45 |
+
result = engine.recognize_pil(img)
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
print(f"=== Backend: {engine.backend_name.upper()} ===")
|
| 48 |
+
print(f"Text: {result.text}")
|
| 49 |
+
print(f"Lines: {len(result.lines)}, Confidence: {result.average_confidence:.1%}")
|
| 50 |
+
if result.text_angle is not None:
|
| 51 |
+
print(f"Angle: {result.text_angle:.1f}")
|
| 52 |
+
print()
|
|
|
|
| 53 |
|
| 54 |
+
for i, line in enumerate(result.lines):
|
| 55 |
+
words = " | ".join(
|
| 56 |
+
f"{w.text} ({w.confidence:.0%})" for w in line.words
|
| 57 |
+
)
|
| 58 |
+
print(f" L{i}: {words}")
|
| 59 |
|
| 60 |
+
# Save JSON if requested
|
| 61 |
+
if args.output:
|
| 62 |
+
import json
|
| 63 |
+
data = {
|
| 64 |
+
"backend": engine.backend_name,
|
| 65 |
+
"text": result.text,
|
| 66 |
+
"text_angle": result.text_angle,
|
| 67 |
+
"lines": [
|
| 68 |
+
{
|
| 69 |
+
"text": line.text,
|
| 70 |
+
"words": [
|
| 71 |
+
{"text": w.text, "confidence": w.confidence}
|
| 72 |
+
for w in line.words
|
| 73 |
+
]
|
| 74 |
+
}
|
| 75 |
+
for line in result.lines
|
| 76 |
+
],
|
| 77 |
+
}
|
| 78 |
+
Path(args.output).write_text(json.dumps(data, indent=2, ensure_ascii=False))
|
| 79 |
+
print(f"\nResults saved to {args.output}")
|
| 80 |
|
| 81 |
|
| 82 |
if __name__ == "__main__":
|
ocr/__init__.py
CHANGED
|
@@ -12,7 +12,12 @@ try:
|
|
| 12 |
except ImportError:
|
| 13 |
OcrEngineOnnx = None # type: ignore[assignment, misc]
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
__all__ = [
|
| 16 |
-
"OcrEngine", "OcrEngineOnnx",
|
| 17 |
"OcrResult", "OcrLine", "OcrWord", "BoundingRect",
|
| 18 |
]
|
|
|
|
| 12 |
except ImportError:
|
| 13 |
OcrEngineOnnx = None # type: ignore[assignment, misc]
|
| 14 |
|
| 15 |
+
try:
|
| 16 |
+
from ocr.engine_unified import OcrEngineUnified
|
| 17 |
+
except ImportError:
|
| 18 |
+
OcrEngineUnified = None # type: ignore[assignment, misc]
|
| 19 |
+
|
| 20 |
__all__ = [
|
| 21 |
+
"OcrEngine", "OcrEngineOnnx", "OcrEngineUnified",
|
| 22 |
"OcrResult", "OcrLine", "OcrWord", "BoundingRect",
|
| 23 |
]
|
ocr/engine_unified.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR engine — unified wrapper that auto-selects the most accurate available backend.
|
| 2 |
+
|
| 3 |
+
Backend selection (automatic):
|
| 4 |
+
1. Windows → native DLL via ctypes (fastest, 100% accuracy)
|
| 5 |
+
2. Linux/macOS with Wine → DLL via Wine subprocess (100% accuracy)
|
| 6 |
+
3. Fallback → pure Python/ONNX reimplementation (~53% match rate)
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
from ocr.engine_unified import OcrEngineUnified
|
| 10 |
+
engine = OcrEngineUnified()
|
| 11 |
+
result = engine.recognize_pil(pil_image)
|
| 12 |
+
print(result.text)
|
| 13 |
+
print(f"Backend: {engine.backend_name}")
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import logging
|
| 20 |
+
import platform
|
| 21 |
+
import sys
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import TYPE_CHECKING
|
| 24 |
+
|
| 25 |
+
from ocr.models import BoundingRect, OcrLine, OcrResult, OcrWord
|
| 26 |
+
|
| 27 |
+
if TYPE_CHECKING:
|
| 28 |
+
from PIL import Image
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class OcrEngineUnified:
|
| 34 |
+
"""Unified OCR engine — auto-selects the best available backend.
|
| 35 |
+
|
| 36 |
+
Priority order:
|
| 37 |
+
1. Native Windows DLL (100%, fastest)
|
| 38 |
+
2. Wine bridge on Linux/macOS (100%, ~2x slower due to subprocess)
|
| 39 |
+
3. ONNX reimplementation (~53%, fully cross-platform)
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
ocr_data_dir: Path to directory with DLL/model files.
|
| 43 |
+
Defaults to PROJECT_ROOT/ocr_data/.
|
| 44 |
+
force_backend: Force a specific backend: 'dll', 'wine', 'onnx', or None (auto).
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
BACKENDS = ("dll", "wine", "onnx")
|
| 48 |
+
|
| 49 |
+
def __init__(
|
| 50 |
+
self,
|
| 51 |
+
ocr_data_dir: str | Path | None = None,
|
| 52 |
+
force_backend: str | None = None,
|
| 53 |
+
) -> None:
|
| 54 |
+
if ocr_data_dir is None:
|
| 55 |
+
ocr_data_dir = Path(__file__).resolve().parent.parent / "ocr_data"
|
| 56 |
+
self._ocr_data = Path(ocr_data_dir)
|
| 57 |
+
self._backend_name: str = "none"
|
| 58 |
+
self._engine = None
|
| 59 |
+
|
| 60 |
+
if force_backend:
|
| 61 |
+
if force_backend not in self.BACKENDS:
|
| 62 |
+
raise ValueError(f"Unknown backend: {force_backend!r}. Choose from {self.BACKENDS}")
|
| 63 |
+
self._init_backend(force_backend)
|
| 64 |
+
else:
|
| 65 |
+
self._auto_select()
|
| 66 |
+
|
| 67 |
+
@property
|
| 68 |
+
def backend_name(self) -> str:
|
| 69 |
+
"""Name of the active backend."""
|
| 70 |
+
return self._backend_name
|
| 71 |
+
|
| 72 |
+
def recognize_pil(self, image: "Image.Image") -> OcrResult:
|
| 73 |
+
"""Run OCR on a PIL Image. Returns OcrResult with text, lines, words."""
|
| 74 |
+
if self._backend_name == "dll":
|
| 75 |
+
return self._engine.recognize_pil(image)
|
| 76 |
+
elif self._backend_name == "wine":
|
| 77 |
+
return self._recognize_wine(image)
|
| 78 |
+
elif self._backend_name == "onnx":
|
| 79 |
+
return self._engine.recognize_pil(image)
|
| 80 |
+
else:
|
| 81 |
+
return OcrResult(error="No OCR backend available")
|
| 82 |
+
|
| 83 |
+
def recognize_bytes(self, image_bytes: bytes) -> OcrResult:
|
| 84 |
+
"""Run OCR on raw image bytes (PNG/JPEG/etc)."""
|
| 85 |
+
from io import BytesIO
|
| 86 |
+
from PIL import Image as PILImage
|
| 87 |
+
img = PILImage.open(BytesIO(image_bytes))
|
| 88 |
+
return self.recognize_pil(img)
|
| 89 |
+
|
| 90 |
+
# ── Backend initialization ──────────────────────────────────
|
| 91 |
+
|
| 92 |
+
def _auto_select(self) -> None:
|
| 93 |
+
"""Try backends in priority order."""
|
| 94 |
+
for backend in self.BACKENDS:
|
| 95 |
+
try:
|
| 96 |
+
self._init_backend(backend)
|
| 97 |
+
logger.info("OCR backend: %s", self._backend_name)
|
| 98 |
+
return
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.debug("Backend %s unavailable: %s", backend, e)
|
| 101 |
+
|
| 102 |
+
logger.warning("No OCR backend available!")
|
| 103 |
+
self._backend_name = "none"
|
| 104 |
+
|
| 105 |
+
def _init_backend(self, name: str) -> None:
|
| 106 |
+
"""Initialize a specific backend."""
|
| 107 |
+
if name == "dll":
|
| 108 |
+
self._init_dll()
|
| 109 |
+
elif name == "wine":
|
| 110 |
+
self._init_wine()
|
| 111 |
+
elif name == "onnx":
|
| 112 |
+
self._init_onnx()
|
| 113 |
+
|
| 114 |
+
def _init_dll(self) -> None:
|
| 115 |
+
"""Initialize native Windows DLL backend."""
|
| 116 |
+
if platform.system() != "Windows":
|
| 117 |
+
raise RuntimeError("DLL backend requires Windows")
|
| 118 |
+
from ocr.engine import OcrEngine
|
| 119 |
+
self._engine = OcrEngine(ocr_data_dir=self._ocr_data)
|
| 120 |
+
self._backend_name = "dll"
|
| 121 |
+
|
| 122 |
+
def _init_wine(self) -> None:
|
| 123 |
+
"""Initialize Wine bridge backend."""
|
| 124 |
+
if platform.system() == "Windows":
|
| 125 |
+
raise RuntimeError("Wine backend is for Linux/macOS only")
|
| 126 |
+
|
| 127 |
+
# Import and check requirements
|
| 128 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "tools"))
|
| 129 |
+
from wine_bridge import WineBridge
|
| 130 |
+
|
| 131 |
+
bridge = WineBridge(ocr_data_dir=self._ocr_data)
|
| 132 |
+
checks = bridge.check_requirements()
|
| 133 |
+
|
| 134 |
+
if not checks["wine_found"]:
|
| 135 |
+
raise RuntimeError("Wine not installed")
|
| 136 |
+
if not checks["dll_exists"]:
|
| 137 |
+
raise RuntimeError(f"oneocr.dll not found in {self._ocr_data}")
|
| 138 |
+
if not checks["model_exists"]:
|
| 139 |
+
raise RuntimeError(f"oneocr.onemodel not found in {self._ocr_data}")
|
| 140 |
+
|
| 141 |
+
# Compile loader if needed
|
| 142 |
+
if not checks["loader_compiled"]:
|
| 143 |
+
if not checks["mingw_found"]:
|
| 144 |
+
raise RuntimeError(
|
| 145 |
+
"MinGW cross-compiler needed to build Wine loader. "
|
| 146 |
+
"Install: sudo apt install mingw-w64"
|
| 147 |
+
)
|
| 148 |
+
bridge.compile_loader()
|
| 149 |
+
|
| 150 |
+
self._engine = bridge
|
| 151 |
+
self._backend_name = "wine"
|
| 152 |
+
|
| 153 |
+
def _init_onnx(self) -> None:
|
| 154 |
+
"""Initialize pure ONNX backend (fallback)."""
|
| 155 |
+
from ocr.engine_onnx import OcrEngineOnnx
|
| 156 |
+
self._engine = OcrEngineOnnx(ocr_data_dir=self._ocr_data)
|
| 157 |
+
self._backend_name = "onnx"
|
| 158 |
+
|
| 159 |
+
# ── Wine result conversion ─────────────────────────────────
|
| 160 |
+
|
| 161 |
+
def _recognize_wine(self, image: "Image.Image") -> OcrResult:
|
| 162 |
+
"""Run OCR via Wine bridge and convert JSON → OcrResult."""
|
| 163 |
+
try:
|
| 164 |
+
raw = self._engine.recognize_pil(image)
|
| 165 |
+
except Exception as e:
|
| 166 |
+
return OcrResult(error=f"Wine bridge error: {e}")
|
| 167 |
+
|
| 168 |
+
return self._json_to_ocr_result(raw)
|
| 169 |
+
|
| 170 |
+
@staticmethod
|
| 171 |
+
def _json_to_ocr_result(data: dict) -> OcrResult:
|
| 172 |
+
"""Convert Wine bridge JSON output to OcrResult dataclass."""
|
| 173 |
+
if "error" in data:
|
| 174 |
+
return OcrResult(error=data["error"])
|
| 175 |
+
|
| 176 |
+
lines = []
|
| 177 |
+
for line_data in data.get("lines", []):
|
| 178 |
+
words = []
|
| 179 |
+
for word_data in line_data.get("words", []):
|
| 180 |
+
bbox = word_data.get("bbox", [0]*8)
|
| 181 |
+
words.append(OcrWord(
|
| 182 |
+
text=word_data.get("text", ""),
|
| 183 |
+
confidence=word_data.get("confidence", 0.0),
|
| 184 |
+
bounding_rect=BoundingRect(
|
| 185 |
+
x1=bbox[0], y1=bbox[1], x2=bbox[2], y2=bbox[3],
|
| 186 |
+
x3=bbox[4], y3=bbox[5], x4=bbox[6], y4=bbox[7],
|
| 187 |
+
),
|
| 188 |
+
))
|
| 189 |
+
|
| 190 |
+
line_bbox = line_data.get("bbox", [0]*8)
|
| 191 |
+
lines.append(OcrLine(
|
| 192 |
+
text=line_data.get("text", ""),
|
| 193 |
+
words=words,
|
| 194 |
+
bounding_rect=BoundingRect(
|
| 195 |
+
x1=line_bbox[0], y1=line_bbox[1],
|
| 196 |
+
x2=line_bbox[2], y2=line_bbox[3],
|
| 197 |
+
x3=line_bbox[4], y3=line_bbox[5],
|
| 198 |
+
x4=line_bbox[6] if len(line_bbox) > 6 else 0,
|
| 199 |
+
y4=line_bbox[7] if len(line_bbox) > 7 else 0,
|
| 200 |
+
),
|
| 201 |
+
))
|
| 202 |
+
|
| 203 |
+
full_text = "\n".join(line.text for line in lines if line.text)
|
| 204 |
+
text_angle = data.get("text_angle")
|
| 205 |
+
|
| 206 |
+
return OcrResult(text=full_text, text_angle=text_angle, lines=lines)
|
test_wine_colab.ipynb
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "d603dd1d",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# OneOCR — Wine Bridge Test na Linux\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"Test czy `oneocr.dll` działa na Linuxie przez Wine.\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"**Co testujemy:**\n",
|
| 13 |
+
"1. Instalacja Wine i MinGW na Ubuntu (Colab)\n",
|
| 14 |
+
"2. Kompilacja C loadera (`oneocr_loader.exe`)\n",
|
| 15 |
+
"3. Uruchomienie DLL przez Wine → OCR na obrazie testowym\n",
|
| 16 |
+
"4. Porównanie wyników z oczekiwanymi"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"execution_count": null,
|
| 22 |
+
"id": "2d700e20",
|
| 23 |
+
"metadata": {},
|
| 24 |
+
"outputs": [],
|
| 25 |
+
"source": [
|
| 26 |
+
"# 1. Install Wine + MinGW\n",
|
| 27 |
+
"!dpkg --add-architecture i386\n",
|
| 28 |
+
"!apt-get update -qq\n",
|
| 29 |
+
"!apt-get install -y -qq wine64 mingw-w64 > /dev/null 2>&1\n",
|
| 30 |
+
"!wine64 --version"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
| 36 |
+
"id": "f0e8cfb5",
|
| 37 |
+
"metadata": {},
|
| 38 |
+
"outputs": [],
|
| 39 |
+
"source": [
|
| 40 |
+
"# 2. Initialize Wine prefix (suppress noise)\n",
|
| 41 |
+
"import os\n",
|
| 42 |
+
"os.environ['WINEDEBUG'] = '-all'\n",
|
| 43 |
+
"os.environ['WINEPREFIX'] = '/root/.wine'\n",
|
| 44 |
+
"os.environ['WINEARCH'] = 'win64'\n",
|
| 45 |
+
"!wineboot --init 2>/dev/null\n",
|
| 46 |
+
"print('Wine prefix initialized')"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": null,
|
| 52 |
+
"id": "74d95c8f",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"# 3. Clone repo from HuggingFace\n",
|
| 57 |
+
"!pip install -q huggingface_hub\n",
|
| 58 |
+
"!git lfs install\n",
|
| 59 |
+
"!git clone https://huggingface.co/MattyMroz/oneocr /content/oneocr\n",
|
| 60 |
+
"!ls -la /content/oneocr/ocr_data/"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"cell_type": "code",
|
| 65 |
+
"execution_count": null,
|
| 66 |
+
"id": "f1a4b3c9",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"outputs": [],
|
| 69 |
+
"source": [
|
| 70 |
+
"# 4. Install Python deps\n",
|
| 71 |
+
"!pip install -q pillow numpy onnxruntime"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": null,
|
| 77 |
+
"id": "1a54ced1",
|
| 78 |
+
"metadata": {},
|
| 79 |
+
"outputs": [],
|
| 80 |
+
"source": [
|
| 81 |
+
"# 5. Compile C loader with MinGW cross-compiler\n",
|
| 82 |
+
"!x86_64-w64-mingw32-gcc -O2 -o /content/oneocr/tools/oneocr_loader.exe /content/oneocr/tools/oneocr_loader.c\n",
|
| 83 |
+
"!ls -la /content/oneocr/tools/oneocr_loader.exe\n",
|
| 84 |
+
"print('C loader compiled OK')"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "code",
|
| 89 |
+
"execution_count": null,
|
| 90 |
+
"id": "7bcb8baa",
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [],
|
| 93 |
+
"source": [
|
| 94 |
+
"# 6. TEST: Run C loader via Wine on a test image\n",
|
| 95 |
+
"import subprocess, json\n",
|
| 96 |
+
"from PIL import Image\n",
|
| 97 |
+
"from pathlib import Path\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"os.environ['WINEDEBUG'] = '-all'\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"# Convert test image to BMP\n",
|
| 102 |
+
"test_img = '/content/oneocr/working_space/input/ocr_test (1).png'\n",
|
| 103 |
+
"bmp_path = '/tmp/test.bmp'\n",
|
| 104 |
+
"img = Image.open(test_img).convert('RGBA')\n",
|
| 105 |
+
"img.save(bmp_path, format='BMP')\n",
|
| 106 |
+
"print(f'Image: {img.size}')\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"# Model key\n",
|
| 109 |
+
"key = b'kj)TGtrK>f]b[Piow.gU+nC@s\"\"\"\"\"\"4'\n",
|
| 110 |
+
"key_hex = key.hex()\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# Convert paths to Wine Z: format\n",
|
| 113 |
+
"dll_dir = 'Z:' + '/content/oneocr/ocr_data'.replace('/', '\\\\\\\\')\n",
|
| 114 |
+
"bmp_wine = 'Z:' + bmp_path.replace('/', '\\\\\\\\')\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"cmd = ['wine64', '/content/oneocr/tools/oneocr_loader.exe', dll_dir, bmp_wine, key_hex]\n",
|
| 117 |
+
"print(f'Running: {\" \".join(cmd[:3])} ...')\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"print(f'Return code: {result.returncode}')\n",
|
| 122 |
+
"if result.stderr:\n",
|
| 123 |
+
" print(f'Stderr: {result.stderr[:500]}')\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"if result.returncode == 0 and result.stdout.strip():\n",
|
| 126 |
+
" data = json.loads(result.stdout.strip())\n",
|
| 127 |
+
" print(f'\\n=== SUCCESS ===')\n",
|
| 128 |
+
" print(f'Text angle: {data[\"text_angle\"]}')\n",
|
| 129 |
+
" for line in data['lines']:\n",
|
| 130 |
+
" words = ' | '.join(f\"{w['text']} ({w['confidence']:.0%})\" for w in line['words'])\n",
|
| 131 |
+
" print(f' Line: {words}')\n",
|
| 132 |
+
" print(f'\\nTotal lines: {len(data[\"lines\"])}')\n",
|
| 133 |
+
"else:\n",
|
| 134 |
+
" print('FAILED')\n",
|
| 135 |
+
" print(result.stdout[:500])"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": null,
|
| 141 |
+
"id": "f09f7fde",
|
| 142 |
+
"metadata": {},
|
| 143 |
+
"outputs": [],
|
| 144 |
+
"source": [
|
| 145 |
+
"# 7. FULL TEST: Run on ALL test images\n",
|
| 146 |
+
"import time\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"test_dir = Path('/content/oneocr/working_space/input')\n",
|
| 149 |
+
"images = sorted(test_dir.glob('*.png'))\n",
|
| 150 |
+
"print(f'Testing {len(images)} images via Wine bridge...\\n')\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"success = 0\n",
|
| 153 |
+
"fail = 0\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"for img_path in images:\n",
|
| 156 |
+
" try:\n",
|
| 157 |
+
" # Convert to BMP\n",
|
| 158 |
+
" img = Image.open(img_path).convert('RGBA')\n",
|
| 159 |
+
" img.save(bmp_path, format='BMP')\n",
|
| 160 |
+
" \n",
|
| 161 |
+
" dll_dir = 'Z:' + '/content/oneocr/ocr_data'.replace('/', '\\\\\\\\')\n",
|
| 162 |
+
" bmp_wine = 'Z:' + bmp_path.replace('/', '\\\\\\\\')\n",
|
| 163 |
+
" \n",
|
| 164 |
+
" t0 = time.time()\n",
|
| 165 |
+
" result = subprocess.run(\n",
|
| 166 |
+
" ['wine64', '/content/oneocr/tools/oneocr_loader.exe', dll_dir, bmp_wine, key_hex],\n",
|
| 167 |
+
" capture_output=True, text=True, timeout=120,\n",
|
| 168 |
+
" env={**os.environ, 'WINEDEBUG': '-all'}\n",
|
| 169 |
+
" )\n",
|
| 170 |
+
" dt = time.time() - t0\n",
|
| 171 |
+
" \n",
|
| 172 |
+
" if result.returncode == 0 and result.stdout.strip():\n",
|
| 173 |
+
" data = json.loads(result.stdout.strip())\n",
|
| 174 |
+
" n_lines = len(data['lines'])\n",
|
| 175 |
+
" text = ' | '.join(l['text'] for l in data['lines'][:3])\n",
|
| 176 |
+
" print(f' OK {img_path.name:25s} | {dt:.1f}s | {n_lines}L | {text[:50]}')\n",
|
| 177 |
+
" success += 1\n",
|
| 178 |
+
" else:\n",
|
| 179 |
+
" print(f' FAIL {img_path.name:25s} | {result.stderr[:80]}')\n",
|
| 180 |
+
" fail += 1\n",
|
| 181 |
+
" except Exception as e:\n",
|
| 182 |
+
" print(f' ERR {img_path.name:25s} | {e}')\n",
|
| 183 |
+
" fail += 1\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"print(f'\\n{\"=\" * 60}')\n",
|
| 186 |
+
"print(f'Result: {success}/{success+fail} OK ({success/(success+fail)*100:.0f}%)')\n",
|
| 187 |
+
"print('Wine bridge on Linux: ' + ('WORKS!' if fail == 0 else 'PARTIAL'))"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"cell_type": "code",
|
| 192 |
+
"execution_count": null,
|
| 193 |
+
"id": "8abf6a75",
|
| 194 |
+
"metadata": {},
|
| 195 |
+
"outputs": [],
|
| 196 |
+
"source": [
|
| 197 |
+
"# 8. TEST: Unified engine with Wine backend\n",
|
| 198 |
+
"import sys\n",
|
| 199 |
+
"sys.path.insert(0, '/content/oneocr')\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"from ocr.engine_unified import OcrEngineUnified\n",
|
| 202 |
+
"\n",
|
| 203 |
+
"engine = OcrEngineUnified()\n",
|
| 204 |
+
"print(f'Backend selected: {engine.backend_name}')\n",
|
| 205 |
+
"\n",
|
| 206 |
+
"img = Image.open('/content/oneocr/working_space/input/ocr_test (10).png')\n",
|
| 207 |
+
"result = engine.recognize_pil(img)\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"print(f'Text: {result.text}')\n",
|
| 210 |
+
"print(f'Lines: {len(result.lines)}')\n",
|
| 211 |
+
"print(f'Confidence: {result.average_confidence:.1%}')\n",
|
| 212 |
+
"print(f'\\nDone! OneOCR DLL works on Linux via Wine.')"
|
| 213 |
+
]
|
| 214 |
+
}
|
| 215 |
+
],
|
| 216 |
+
"metadata": {
|
| 217 |
+
"language_info": {
|
| 218 |
+
"name": "python"
|
| 219 |
+
}
|
| 220 |
+
},
|
| 221 |
+
"nbformat": 4,
|
| 222 |
+
"nbformat_minor": 5
|
| 223 |
+
}
|
tools/oneocr_loader.c
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/* oneocr_loader.c -- Minimal OneOCR DLL loader for Wine
|
| 3 |
+
* Compile: x86_64-w64-mingw32-gcc -O2 -o oneocr_loader.exe oneocr_loader.c
|
| 4 |
+
* Usage: wine oneocr_loader.exe <dll_dir> <image_bmp> <model_key_hex>
|
| 5 |
+
* Output: JSON to stdout on success; on failure a JSON error object on stderr and exit code 1
|
| 6 |
+
*/
|
| 7 |
+
#include <stdio.h>
|
| 8 |
+
#include <stdlib.h>
|
| 9 |
+
#include <string.h>
|
| 10 |
+
#include <windows.h>
|
| 11 |
+
|
| 12 |
+
/* DLL function types.
 *
 * One pointer typedef per oneocr.dll export.  main() resolves each export by
 * name with GetProcAddress through its GETFN macro, which builds the type
 * name as fn_<ExportName>, so the typedef names must match the export names.
 * The long long arguments carry the values the DLL hands back through its
 * long long* out-parameters (init options, pipeline, process options, result,
 * line, word).  main() treats a non-zero return from CreateOcrPipeline and
 * RunOcrPipeline as failure.
 */
typedef long long (*fn_CreateOcrInitOptions)(long long*);
typedef long long (*fn_OcrInitOptionsSetUseModelDelayLoad)(long long, char);
typedef long long (*fn_CreateOcrPipeline)(const char*, const char*, long long, long long*);
typedef long long (*fn_CreateOcrProcessOptions)(long long*);
typedef long long (*fn_OcrProcessOptionsSetMaxRecognitionLineCount)(long long, long long);
typedef long long (*fn_RunOcrPipeline)(long long, void*, long long, long long*);
typedef long long (*fn_GetImageAngle)(long long, float*);
typedef long long (*fn_GetOcrLineCount)(long long, long long*);
typedef long long (*fn_GetOcrLine)(long long, long long, long long*);
typedef long long (*fn_GetOcrLineContent)(long long, const char**);
typedef long long (*fn_GetOcrLineBoundingBox)(long long, void**);
typedef long long (*fn_GetOcrLineWordCount)(long long, long long*);
typedef long long (*fn_GetOcrWord)(long long, long long, long long*);
typedef long long (*fn_GetOcrWordContent)(long long, const char**);
typedef long long (*fn_GetOcrWordBoundingBox)(long long, void**);
typedef long long (*fn_GetOcrWordConfidence)(long long, float*);
typedef void (*fn_ReleaseOcrResult)(long long);
typedef void (*fn_ReleaseOcrInitOptions)(long long);
typedef void (*fn_ReleaseOcrPipeline)(long long);
typedef void (*fn_ReleaseOcrProcessOptions)(long long);

#pragma pack(push, 1)
/* Image descriptor whose address main() passes to RunOcrPipeline. */
typedef struct {
    int type; /* 3 = BGRA 4-channel (matches engine.py) */
    int width;  /* filled by main() from the loaded BMP */
    int height; /* filled by main() from the loaded BMP */
    int reserved; /* main() always sets this to 0 */
    long long step; /* main() sets this to width * 4, i.e. bytes per BGRA row */
    unsigned char *data; /* pixel buffer returned by load_bmp_bgra() */
} ImageStruct;

/* Eight floats read through the pointer stored by GetOcrLineBoundingBox /
 * GetOcrWordBoundingBox and printed in this order in the JSON "bbox" array.
 * Presumably the four (x, y) corners of a quadrilateral -- TODO confirm the
 * corner order against the DLL. */
typedef struct {
    float x1, y1, x2, y2, x3, y3, x4, y4;
} BBox;
#pragma pack(pop)
|
| 48 |
+
|
| 49 |
+
/* Load a 24- or 32-bit BMP file into a newly allocated top-down BGRA buffer.
 *
 * path  file to read
 * w, h  receive the image width and the absolute height in pixels; they are
 *       written only when the load succeeds
 *
 * Returns a malloc'ed buffer of width * height * 4 bytes (caller frees), or
 * NULL when the file cannot be opened, is truncated, does not start with the
 * "BM" signature, has non-positive/overflowing dimensions, uses a bit depth
 * other than 24 or 32, or memory runs out.
 *
 * Header fields are copied with memcpy in host byte order, so like the rest
 * of this loader it assumes a little-endian host.
 */
static unsigned char* load_bmp_bgra(const char* path, int* w, int* h) {
    unsigned char header[54];
    unsigned char* bgra = NULL;
    unsigned char* row = NULL;
    int width = 0, height = 0, offset = 0, abs_h = 0, ok = 0;
    short bpp = 0;
    size_t stride = 0;

    FILE* f = fopen(path, "rb");
    if (!f) return NULL;

    /* A short read would leave the header fields uninitialized. */
    if (fread(header, 1, sizeof(header), f) != sizeof(header)) goto done;
    if (header[0] != 'B' || header[1] != 'M') goto done;

    memcpy(&offset, header + 10, sizeof(offset));
    memcpy(&width,  header + 18, sizeof(width));
    memcpy(&height, header + 22, sizeof(height));
    memcpy(&bpp,    header + 28, sizeof(bpp));

    /* Only the two layouts converted below are supported; anything else used
     * to return an uninitialized buffer as if it were a valid image. */
    if (bpp != 24 && bpp != 32) goto done;

    /* width * 4 must fit in int because callers compute the row step from it;
     * INT_MIN height cannot be negated. */
    if (width <= 0 || width > 0x7FFFFFFF / 4) goto done;
    if (height == 0 || height == (-0x7FFFFFFF - 1)) goto done;
    abs_h = height < 0 ? -height : height;

    stride = (size_t)width * 4;
    if ((size_t)abs_h > ((size_t)-1) / stride) goto done;

    if (fseek(f, offset, SEEK_SET) != 0) goto done;

    /* Allocate BGRA buffer */
    bgra = (unsigned char*)malloc(stride * (size_t)abs_h);
    if (!bgra) goto done;

    if (bpp == 24) {
        /* Source rows are padded to a multiple of 4 bytes. */
        size_t row_size = ((size_t)width * 3 + 3) & ~(size_t)3;
        row = (unsigned char*)malloc(row_size);
        if (!row) goto done;

        for (int y = 0; y < abs_h; y++) {
            /* Positive height means the file stores rows bottom-up. */
            int dest_y = (height > 0) ? (abs_h - 1 - y) : y;
            unsigned char* dst = bgra + (size_t)dest_y * stride;

            if (fread(row, 1, row_size, f) != row_size) goto done;
            for (int x = 0; x < width; x++) {
                dst[x * 4 + 0] = row[x * 3 + 0]; /* B */
                dst[x * 4 + 1] = row[x * 3 + 1]; /* G */
                dst[x * 4 + 2] = row[x * 3 + 2]; /* R */
                dst[x * 4 + 3] = 255;            /* A */
            }
        }
    } else {
        for (int y = 0; y < abs_h; y++) {
            int dest_y = (height > 0) ? (abs_h - 1 - y) : y;
            if (fread(bgra + (size_t)dest_y * stride, 1, stride, f) != stride) goto done;
        }
    }

    *w = width;
    *h = abs_h;
    ok = 1;

done:
    free(row);
    fclose(f);
    if (!ok) {
        free(bgra);
        bgra = NULL;
    }
    return bgra;
}
|
| 93 |
+
|
| 94 |
+
/* Write s into out as a double-quoted JSON string literal.
 *
 * s    NUL-terminated input bytes
 * out  destination buffer
 * max  size of out in bytes
 *
 * Quotes, backslashes and every control character below 0x20 are escaped
 * (short forms for \b \f \n \r \t, \u00XX for the rest) so the output is
 * always a valid JSON string.  Bytes >= 0x80 are copied unchanged.  Input
 * that does not fit is truncated at an escape boundary; the closing quote and
 * terminating NUL are always written.  When max is too small to hold even an
 * empty literal ("" plus NUL), out is set to the empty C string if max > 0.
 */
static void json_escape(const char* s, char* out, int max) {
    static const char hex[] = "0123456789abcdef";
    int j = 0;

    if (max < 3) {
        if (max > 0) out[0] = 0;
        return;
    }

    out[j++] = '"';
    for (int i = 0; s[i]; i++) {
        unsigned char c = (unsigned char)s[i];
        char esc = 0;
        int need;

        switch (c) {
            case '"':  esc = '"';  break;
            case '\\': esc = '\\'; break;
            case '\b': esc = 'b';  break;
            case '\f': esc = 'f';  break;
            case '\n': esc = 'n';  break;
            case '\r': esc = 'r';  break;
            case '\t': esc = 't';  break;
            default: break;
        }

        need = esc ? 2 : (c < 0x20 ? 6 : 1);
        /* Keep two bytes in reserve for the closing quote and the NUL. */
        if (j + need > max - 2) break;

        if (esc) {
            out[j++] = '\\';
            out[j++] = esc;
        } else if (c < 0x20) {
            out[j++] = '\\';
            out[j++] = 'u';
            out[j++] = '0';
            out[j++] = '0';
            out[j++] = hex[c >> 4];
            out[j++] = hex[c & 0x0F];
        } else {
            out[j++] = (char)c;
        }
    }
    out[j++] = '"';
    out[j] = 0;
}
|
| 109 |
+
|
| 110 |
+
int main(int argc, char** argv) {
|
| 111 |
+
if (argc < 4) {
|
| 112 |
+
fprintf(stderr, "Usage: %s <dll_dir> <image.bmp> <model_key_hex>\n", argv[0]);
|
| 113 |
+
return 1;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
const char* dll_dir = argv[1];
|
| 117 |
+
const char* img_path = argv[2];
|
| 118 |
+
const char* key_hex = argv[3];
|
| 119 |
+
|
| 120 |
+
/* Set DLL search path */
|
| 121 |
+
SetDllDirectoryA(dll_dir);
|
| 122 |
+
char old_path[32768];
|
| 123 |
+
GetEnvironmentVariableA("PATH", old_path, sizeof(old_path));
|
| 124 |
+
char new_path[32768];
|
| 125 |
+
snprintf(new_path, sizeof(new_path), "%s;%s", dll_dir, old_path);
|
| 126 |
+
SetEnvironmentVariableA("PATH", new_path);
|
| 127 |
+
|
| 128 |
+
/* Load DLL */
|
| 129 |
+
char dll_path[MAX_PATH];
|
| 130 |
+
snprintf(dll_path, sizeof(dll_path), "%s\\oneocr.dll", dll_dir);
|
| 131 |
+
|
| 132 |
+
HMODULE hmod = LoadLibraryA(dll_path);
|
| 133 |
+
if (!hmod) {
|
| 134 |
+
fprintf(stderr, "{\"error\": \"LoadLibrary failed: %lu\"}\n", GetLastError());
|
| 135 |
+
return 1;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
/* Get function pointers */
|
| 139 |
+
#define GETFN(name) fn_##name p##name = (fn_##name)GetProcAddress(hmod, #name); \
|
| 140 |
+
if (!p##name) { fprintf(stderr, "{\"error\": \"GetProcAddress(%s) failed\"}\n", #name); return 1; }
|
| 141 |
+
|
| 142 |
+
GETFN(CreateOcrInitOptions)
|
| 143 |
+
GETFN(OcrInitOptionsSetUseModelDelayLoad)
|
| 144 |
+
GETFN(CreateOcrPipeline)
|
| 145 |
+
GETFN(CreateOcrProcessOptions)
|
| 146 |
+
GETFN(OcrProcessOptionsSetMaxRecognitionLineCount)
|
| 147 |
+
GETFN(RunOcrPipeline)
|
| 148 |
+
GETFN(GetImageAngle)
|
| 149 |
+
GETFN(GetOcrLineCount)
|
| 150 |
+
GETFN(GetOcrLine)
|
| 151 |
+
GETFN(GetOcrLineContent)
|
| 152 |
+
GETFN(GetOcrLineBoundingBox)
|
| 153 |
+
GETFN(GetOcrLineWordCount)
|
| 154 |
+
GETFN(GetOcrWord)
|
| 155 |
+
GETFN(GetOcrWordContent)
|
| 156 |
+
GETFN(GetOcrWordBoundingBox)
|
| 157 |
+
GETFN(GetOcrWordConfidence)
|
| 158 |
+
GETFN(ReleaseOcrResult)
|
| 159 |
+
GETFN(ReleaseOcrInitOptions)
|
| 160 |
+
GETFN(ReleaseOcrPipeline)
|
| 161 |
+
GETFN(ReleaseOcrProcessOptions)
|
| 162 |
+
|
| 163 |
+
/* Model path and key */
|
| 164 |
+
char model_path[MAX_PATH];
|
| 165 |
+
snprintf(model_path, sizeof(model_path), "%s\\oneocr.onemodel", dll_dir);
|
| 166 |
+
|
| 167 |
+
/* Decode hex key */
|
| 168 |
+
int key_len = strlen(key_hex) / 2;
|
| 169 |
+
char key[64];
|
| 170 |
+
for (int i = 0; i < key_len && i < 63; i++) {
|
| 171 |
+
sscanf(key_hex + i*2, "%2hhx", &key[i]);
|
| 172 |
+
}
|
| 173 |
+
key[key_len] = 0;
|
| 174 |
+
|
| 175 |
+
/* Initialize pipeline */
|
| 176 |
+
long long init_opts = 0;
|
| 177 |
+
pCreateOcrInitOptions(&init_opts);
|
| 178 |
+
|
| 179 |
+
long long pipeline = 0;
|
| 180 |
+
long long res = pCreateOcrPipeline(model_path, key, init_opts, &pipeline);
|
| 181 |
+
if (res != 0) {
|
| 182 |
+
fprintf(stderr, "{\"error\": \"CreateOcrPipeline failed: %lld\"}\n", res);
|
| 183 |
+
return 1;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
long long proc_opts = 0;
|
| 187 |
+
pCreateOcrProcessOptions(&proc_opts);
|
| 188 |
+
pOcrProcessOptionsSetMaxRecognitionLineCount(proc_opts, 200);
|
| 189 |
+
|
| 190 |
+
/* Load image */
|
| 191 |
+
int w = 0, h = 0;
|
| 192 |
+
unsigned char* data = load_bmp_bgra(img_path, &w, &h);
|
| 193 |
+
if (!data) {
|
| 194 |
+
fprintf(stderr, "{\"error\": \"Failed to load image\"}\n");
|
| 195 |
+
return 1;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
ImageStruct img = {3, w, h, 0, (long long)(w * 4), data};
|
| 199 |
+
|
| 200 |
+
/* Run OCR */
|
| 201 |
+
long long result = 0;
|
| 202 |
+
res = pRunOcrPipeline(pipeline, &img, proc_opts, &result);
|
| 203 |
+
if (res != 0) {
|
| 204 |
+
fprintf(stderr, "{\"error\": \"RunOcrPipeline failed: %lld\"}\n", res);
|
| 205 |
+
return 1;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
/* Extract results */
|
| 209 |
+
float angle = 0;
|
| 210 |
+
pGetImageAngle(result, &angle);
|
| 211 |
+
|
| 212 |
+
long long line_count = 0;
|
| 213 |
+
pGetOcrLineCount(result, &line_count);
|
| 214 |
+
|
| 215 |
+
/* Output JSON */
|
| 216 |
+
char buf[65536];
|
| 217 |
+
int pos = 0;
|
| 218 |
+
pos += snprintf(buf + pos, sizeof(buf) - pos,
|
| 219 |
+
"{\"text_angle\": %.4f, \"lines\": [", angle);
|
| 220 |
+
|
| 221 |
+
for (long long i = 0; i < line_count; i++) {
|
| 222 |
+
long long line = 0;
|
| 223 |
+
pGetOcrLine(result, i, &line);
|
| 224 |
+
|
| 225 |
+
const char* line_text = NULL;
|
| 226 |
+
pGetOcrLineContent(line, &line_text);
|
| 227 |
+
|
| 228 |
+
BBox* line_bbox = NULL;
|
| 229 |
+
pGetOcrLineBoundingBox(line, (void**)&line_bbox);
|
| 230 |
+
|
| 231 |
+
long long word_count = 0;
|
| 232 |
+
pGetOcrLineWordCount(line, &word_count);
|
| 233 |
+
|
| 234 |
+
if (i > 0) pos += snprintf(buf + pos, sizeof(buf) - pos, ", ");
|
| 235 |
+
|
| 236 |
+
char esc_line[4096];
|
| 237 |
+
json_escape(line_text ? line_text : "", esc_line, sizeof(esc_line));
|
| 238 |
+
|
| 239 |
+
pos += snprintf(buf + pos, sizeof(buf) - pos,
|
| 240 |
+
"{\"text\": %s, \"bbox\": [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f], \"words\": [",
|
| 241 |
+
esc_line,
|
| 242 |
+
line_bbox ? line_bbox->x1 : 0, line_bbox ? line_bbox->y1 : 0,
|
| 243 |
+
line_bbox ? line_bbox->x2 : 0, line_bbox ? line_bbox->y2 : 0,
|
| 244 |
+
line_bbox ? line_bbox->x3 : 0, line_bbox ? line_bbox->y3 : 0,
|
| 245 |
+
line_bbox ? line_bbox->x4 : 0, line_bbox ? line_bbox->y4 : 0);
|
| 246 |
+
|
| 247 |
+
for (long long j = 0; j < word_count; j++) {
|
| 248 |
+
long long word = 0;
|
| 249 |
+
pGetOcrWord(line, j, &word);
|
| 250 |
+
|
| 251 |
+
const char* word_text = NULL;
|
| 252 |
+
pGetOcrWordContent(word, &word_text);
|
| 253 |
+
|
| 254 |
+
BBox* word_bbox = NULL;
|
| 255 |
+
pGetOcrWordBoundingBox(word, (void**)&word_bbox);
|
| 256 |
+
|
| 257 |
+
float word_conf = 0;
|
| 258 |
+
pGetOcrWordConfidence(word, &word_conf);
|
| 259 |
+
|
| 260 |
+
if (j > 0) pos += snprintf(buf + pos, sizeof(buf) - pos, ", ");
|
| 261 |
+
|
| 262 |
+
char esc_word[2048];
|
| 263 |
+
json_escape(word_text ? word_text : "", esc_word, sizeof(esc_word));
|
| 264 |
+
|
| 265 |
+
pos += snprintf(buf + pos, sizeof(buf) - pos,
|
| 266 |
+
"{\"text\": %s, \"confidence\": %.4f, \"bbox\": [%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]}",
|
| 267 |
+
esc_word, word_conf,
|
| 268 |
+
word_bbox ? word_bbox->x1 : 0, word_bbox ? word_bbox->y1 : 0,
|
| 269 |
+
word_bbox ? word_bbox->x2 : 0, word_bbox ? word_bbox->y2 : 0,
|
| 270 |
+
word_bbox ? word_bbox->x3 : 0, word_bbox ? word_bbox->y3 : 0,
|
| 271 |
+
word_bbox ? word_bbox->x4 : 0, word_bbox ? word_bbox->y4 : 0);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
pos += snprintf(buf + pos, sizeof(buf) - pos, "]}");
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
pos += snprintf(buf + pos, sizeof(buf) - pos, "]}");
|
| 278 |
+
|
| 279 |
+
/* Write JSON to stdout */
|
| 280 |
+
printf("%s\n", buf);
|
| 281 |
+
fflush(stdout);
|
| 282 |
+
|
| 283 |
+
/* Cleanup */
|
| 284 |
+
pReleaseOcrResult(result);
|
| 285 |
+
free(data);
|
| 286 |
+
pReleaseOcrProcessOptions(proc_opts);
|
| 287 |
+
pReleaseOcrPipeline(pipeline);
|
| 288 |
+
pReleaseOcrInitOptions(init_opts);
|
| 289 |
+
FreeLibrary(hmod);
|
| 290 |
+
|
| 291 |
+
return 0;
|
| 292 |
+
}
|
tools/oneocr_loader.exe
ADDED
|
Binary file (71 kB). View file
|
|
|
tools/wine_bridge.py
ADDED
|
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Wine Bridge — Run OneOCR DLL on Linux via Wine subprocess.
|
| 4 |
+
|
| 5 |
+
Strategy: Use Wine to run a tiny Windows Python script that loads the DLL,
|
| 6 |
+
processes an image, and returns JSON results via stdout.
|
| 7 |
+
|
| 8 |
+
This avoids ctypes-over-Wine complexity by using Wine's own Python/executable.
|
| 9 |
+
|
| 10 |
+
Architecture:
|
| 11 |
+
Linux Python ──► subprocess (wine) ──► Windows DLL loader ──► JSON stdout
|
| 12 |
+
|
| 13 |
+
Requirements on Linux:
|
| 14 |
+
- wine (>= 8.0, 64-bit prefix)
|
| 15 |
+
- Python for Windows installed in Wine prefix (or standalone exe)
|
| 16 |
+
|
| 17 |
+
Alternative: Compile a minimal C loader → .exe, ship it, run via Wine.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import platform
|
| 25 |
+
import shutil
|
| 26 |
+
import struct
|
| 27 |
+
import subprocess
|
| 28 |
+
import sys
|
| 29 |
+
import tempfile
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import TYPE_CHECKING
|
| 32 |
+
|
| 33 |
+
if TYPE_CHECKING:
|
| 34 |
+
from PIL import Image
|
| 35 |
+
|
| 36 |
+
# ─── Wine DLL Loader (C code) ─────────────────────────────────────────────
|
| 37 |
+
# This is a self-contained C program that loads oneocr.dll and runs OCR.
|
| 38 |
+
# It is compiled once on the target system using x86_64-w64-mingw32-gcc
|
| 39 |
+
# (MinGW cross-compiler available on every Linux distro).
|
| 40 |
+
|
| 41 |
+
WINE_LOADER_C = r"""
|
| 42 |
+
/* oneocr_loader.c -- Minimal OneOCR DLL loader for Wine
|
| 43 |
+
* Compile: x86_64-w64-mingw32-gcc -O2 -o oneocr_loader.exe oneocr_loader.c
|
| 44 |
+
* Usage: wine oneocr_loader.exe <dll_dir> <image_bmp> <model_key>
|
| 45 |
+
* Output: JSON to stdout
|
| 46 |
+
*/
|
| 47 |
+
#include <stdio.h>
|
| 48 |
+
#include <stdlib.h>
|
| 49 |
+
#include <string.h>
|
| 50 |
+
#include <windows.h>
|
| 51 |
+
|
| 52 |
+
/* DLL function types */
|
| 53 |
+
typedef long long (*fn_CreateOcrInitOptions)(long long*);
|
| 54 |
+
typedef long long (*fn_OcrInitOptionsSetUseModelDelayLoad)(long long, char);
|
| 55 |
+
typedef long long (*fn_CreateOcrPipeline)(const char*, const char*, long long, long long*);
|
| 56 |
+
typedef long long (*fn_CreateOcrProcessOptions)(long long*);
|
| 57 |
+
typedef long long (*fn_OcrProcessOptionsSetMaxRecognitionLineCount)(long long, long long);
|
| 58 |
+
typedef long long (*fn_RunOcrPipeline)(long long, void*, long long, long long*);
|
| 59 |
+
typedef long long (*fn_GetImageAngle)(long long, float*);
|
| 60 |
+
typedef long long (*fn_GetOcrLineCount)(long long, long long*);
|
| 61 |
+
typedef long long (*fn_GetOcrLine)(long long, long long, long long*);
|
| 62 |
+
typedef long long (*fn_GetOcrLineContent)(long long, const char**);
|
| 63 |
+
typedef long long (*fn_GetOcrLineBoundingBox)(long long, void**);
|
| 64 |
+
typedef long long (*fn_GetOcrLineWordCount)(long long, long long*);
|
| 65 |
+
typedef long long (*fn_GetOcrWord)(long long, long long, long long*);
|
| 66 |
+
typedef long long (*fn_GetOcrWordContent)(long long, const char**);
|
| 67 |
+
typedef long long (*fn_GetOcrWordBoundingBox)(long long, void**);
|
| 68 |
+
typedef long long (*fn_GetOcrWordConfidence)(long long, float*);
|
| 69 |
+
typedef void (*fn_ReleaseOcrResult)(long long);
|
| 70 |
+
typedef void (*fn_ReleaseOcrInitOptions)(long long);
|
| 71 |
+
typedef void (*fn_ReleaseOcrPipeline)(long long);
|
| 72 |
+
typedef void (*fn_ReleaseOcrProcessOptions)(long long);
|
| 73 |
+
|
| 74 |
+
#pragma pack(push, 1)
|
| 75 |
+
typedef struct {
|
| 76 |
+
int type; /* 3 = BGRA 4-channel (matches engine.py) */
|
| 77 |
+
int width;
|
| 78 |
+
int height;
|
| 79 |
+
int reserved;
|
| 80 |
+
long long step;
|
| 81 |
+
unsigned char *data;
|
| 82 |
+
} ImageStruct;
|
| 83 |
+
|
| 84 |
+
typedef struct {
|
| 85 |
+
float x1, y1, x2, y2, x3, y3, x4, y4;
|
| 86 |
+
} BBox;
|
| 87 |
+
#pragma pack(pop)
|
| 88 |
+
|
| 89 |
+
/* Read a little-endian 32-bit value without relying on alignment. */
static int bmp_le32(const unsigned char* p) {
    unsigned int v = (unsigned int)p[0]
                   | ((unsigned int)p[1] << 8)
                   | ((unsigned int)p[2] << 16)
                   | ((unsigned int)p[3] << 24);
    return (int)v;
}

/*
 * Simple BMP loader producing a top-down 32-bit BGRA buffer.
 *
 * Supports uncompressed 24-bit (alpha forced to 255) and 32-bit bitmaps,
 * stored either bottom-up (positive height) or top-down (negative height).
 *
 * path : file to read.
 * w, h : receive the width and the absolute height on success; they are
 *        left untouched on failure.
 *
 * Returns a malloc'ed buffer of (*w) * (*h) * 4 bytes that the caller must
 * free(), or NULL when the file cannot be opened, is truncated, is not a
 * BMP, has an unsupported bit depth, or has nonsensical dimensions.
 */
static unsigned char* load_bmp_bgra(const char* path, int* w, int* h) {
    unsigned char header[54];
    unsigned char* bgra = NULL;
    unsigned char* row = NULL;
    int width, height, abs_h, bpp, offset, top_down;
    size_t out_stride, src_stride, min_row;
    int ok = 0;
    FILE* f;

    if (!path || !w || !h) return NULL;

    f = fopen(path, "rb");
    if (!f) return NULL;

    /* A short read would leave the header partly uninitialised. */
    if (fread(header, 1, sizeof(header), f) != sizeof(header)) goto done;
    if (header[0] != 'B' || header[1] != 'M') goto done;

    offset = bmp_le32(header + 10);
    width  = bmp_le32(header + 18);
    height = bmp_le32(header + 22);
    bpp    = header[28] | (header[29] << 8);

    /* Only these two depths are converted; anything else used to return
       an uninitialised buffer. */
    if (bpp != 24 && bpp != 32) goto done;

    /* Reject empty images and the one height that cannot be negated. */
    if (width <= 0 || height == 0 || height < -0x7FFFFFFF) goto done;
    top_down = height < 0;
    abs_h = top_down ? -height : height;

    /* The caller stores w * 4 in an int, so the total must fit in int. */
    if (width > 0x7FFFFFFF / 4 || abs_h > 0x7FFFFFFF / (width * 4)) goto done;

    out_stride = (size_t)width * 4;
    min_row    = (bpp == 24) ? (size_t)width * 3 : out_stride;
    /* 24-bit rows are padded to a multiple of four bytes in the file. */
    src_stride = (bpp == 24) ? ((min_row + 3) & ~(size_t)3) : out_stride;

    if (offset < 0 || fseek(f, (long)offset, SEEK_SET) != 0) goto done;

    /* Allocate BGRA buffer */
    bgra = (unsigned char*)malloc(out_stride * (size_t)abs_h);
    if (!bgra) goto done;

    if (bpp == 24) {
        row = (unsigned char*)malloc(src_stride);
        if (!row) goto done;
    }

    for (int y = 0; y < abs_h; y++) {
        /* Bottom-up files store the last image row first. */
        int dest_y = top_down ? y : (abs_h - 1 - y);
        unsigned char* dst = bgra + (size_t)dest_y * out_stride;

        if (bpp == 32) {
            if (fread(dst, 1, out_stride, f) != out_stride) goto done;
        } else {
            /* Tolerate missing padding on the final row, but not missing pixels. */
            if (fread(row, 1, src_stride, f) < min_row) goto done;
            for (int x = 0; x < width; x++) {
                dst[x * 4 + 0] = row[x * 3 + 0]; /* B */
                dst[x * 4 + 1] = row[x * 3 + 1]; /* G */
                dst[x * 4 + 2] = row[x * 3 + 2]; /* R */
                dst[x * 4 + 3] = 255;            /* A */
            }
        }
    }

    *w = width;
    *h = abs_h;
    ok = 1;

done:
    free(row);
    fclose(f);
    if (!ok) {
        free(bgra);
        bgra = NULL;
    }
    return bgra;
}
|
| 133 |
+
|
| 134 |
+
/*
 * Escape JSON string.
 *
 * Writes s into out as a double-quoted JSON string literal, always
 * NUL-terminated, never using more than max bytes of out.
 *
 *  - the quote, backslash, newline, carriage return and tab characters get
 *    their two-character escapes;
 *  - every other control character below 0x20 is written as \u00XX, since
 *    JSON forbids raw control characters inside strings;
 *  - bytes >= 0x80 are copied through, but a multi-byte UTF-8 sequence is
 *    only copied whole, so truncation never leaves half a character.
 *
 * Input that does not fit is silently truncated. When max is too small for
 * even an empty literal (fewer than 3 bytes) out becomes an empty C string.
 */
static void json_escape(const char* s, char* out, int max) {
    static const char hex_digits[] = "0123456789abcdef";
    int j = 0;

    if (!out || max <= 0) return;
    if (max < 3) { out[0] = 0; return; }
    if (!s) s = "";

    out[j++] = '"';
    for (int i = 0; s[i]; ) {
        unsigned char c = (unsigned char)s[i];
        char short_esc = 0;
        int in_len = 1;
        int out_len;

        switch (c) {
            case '"':  short_esc = '"';  break;
            case '\\': short_esc = '\\'; break;
            case '\n': short_esc = 'n';  break;
            case '\r': short_esc = 'r';  break;
            case '\t': short_esc = 't';  break;
            default: break;
        }

        if (short_esc) {
            out_len = 2;
        } else if (c < 0x20) {
            out_len = 6;
        } else {
            /* Length of the UTF-8 sequence announced by the lead byte. */
            if (c >= 0xF0) in_len = 4;
            else if (c >= 0xE0) in_len = 3;
            else if (c >= 0xC0) in_len = 2;
            /* Never read past the terminator of a malformed string. */
            for (int k = 1; k < in_len; k++) {
                if (!s[i + k]) { in_len = k; break; }
            }
            out_len = in_len;
        }

        /* Keep room for the closing quote and the NUL terminator. */
        if (j + out_len + 2 > max) break;

        if (short_esc) {
            out[j++] = '\\';
            out[j++] = short_esc;
        } else if (c < 0x20) {
            out[j++] = '\\';
            out[j++] = 'u';
            out[j++] = '0';
            out[j++] = '0';
            out[j++] = hex_digits[c >> 4];
            out[j++] = hex_digits[c & 0x0F];
        } else {
            for (int k = 0; k < in_len; k++) out[j++] = s[i + k];
        }
        i += in_len;
    }
    out[j++] = '"';
    out[j] = 0;
}
|
| 149 |
+
|
| 150 |
+
/* Value of one hexadecimal digit, or -1 when c is not a hex digit. */
static int hex_nibble(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/* Print a bounding box to stdout as a JSON array of eight numbers;
   a NULL box prints zeros. */
static void print_bbox(const BBox* b) {
    printf("[%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f]",
           b ? b->x1 : 0.0f, b ? b->y1 : 0.0f,
           b ? b->x2 : 0.0f, b ? b->y2 : 0.0f,
           b ? b->x3 : 0.0f, b ? b->y3 : 0.0f,
           b ? b->x4 : 0.0f, b ? b->y4 : 0.0f);
}

/*
 * Loader entry point.
 *
 * Usage: <exe> <dll_dir> <image.bmp> <model_key_hex>
 *
 * Loads oneocr.dll from dll_dir, runs the OCR pipeline on the bitmap and
 * prints one JSON document to stdout:
 *   {"text_angle": f, "lines": [{"text", "bbox", "words": [...]}, ...]}
 * On failure a {"error": "..."} object is printed to stderr and the exit
 * status is 1.
 *
 * The JSON is streamed straight to stdout instead of being assembled in a
 * fixed-size buffer, so a page with a lot of text can no longer overflow it.
 */
int main(int argc, char** argv) {
    if (argc < 4) {
        fprintf(stderr, "Usage: %s <dll_dir> <image.bmp> <model_key_hex>\n", argv[0]);
        return 1;
    }

    const char* dll_dir = argv[1];
    const char* img_path = argv[2];
    const char* key_hex = argv[3];

    /* Set DLL search path */
    SetDllDirectoryA(dll_dir);
    {
        /* static: keeps 64 KiB of scratch space off the stack */
        static char old_path[32768];
        static char new_path[32768];
        DWORD got = GetEnvironmentVariableA("PATH", old_path, sizeof(old_path));
        int n = -1;

        if (got == 0) {
            /* PATH is unset or empty: use dll_dir alone. */
            n = snprintf(new_path, sizeof(new_path), "%s", dll_dir);
        } else if (got < sizeof(old_path)) {
            n = snprintf(new_path, sizeof(new_path), "%s;%s", dll_dir, old_path);
        }
        /* Only replace PATH when the new value was built without truncation;
           SetDllDirectoryA above already covers the DLL lookup itself. */
        if (n > 0 && (size_t)n < sizeof(new_path)) {
            SetEnvironmentVariableA("PATH", new_path);
        }
    }

    /* Load DLL */
    char dll_path[MAX_PATH];
    snprintf(dll_path, sizeof(dll_path), "%s\\oneocr.dll", dll_dir);

    HMODULE hmod = LoadLibraryA(dll_path);
    if (!hmod) {
        fprintf(stderr, "{\"error\": \"LoadLibrary failed: %lu\"}\n", GetLastError());
        return 1;
    }

    /* Get function pointers */
    #define GETFN(name) fn_##name p##name = (fn_##name)GetProcAddress(hmod, #name); \
        if (!p##name) { fprintf(stderr, "{\"error\": \"GetProcAddress(%s) failed\"}\n", #name); return 1; }

    GETFN(CreateOcrInitOptions)
    GETFN(OcrInitOptionsSetUseModelDelayLoad)
    GETFN(CreateOcrPipeline)
    GETFN(CreateOcrProcessOptions)
    GETFN(OcrProcessOptionsSetMaxRecognitionLineCount)
    GETFN(RunOcrPipeline)
    GETFN(GetImageAngle)
    GETFN(GetOcrLineCount)
    GETFN(GetOcrLine)
    GETFN(GetOcrLineContent)
    GETFN(GetOcrLineBoundingBox)
    GETFN(GetOcrLineWordCount)
    GETFN(GetOcrWord)
    GETFN(GetOcrWordContent)
    GETFN(GetOcrWordBoundingBox)
    GETFN(GetOcrWordConfidence)
    GETFN(ReleaseOcrResult)
    GETFN(ReleaseOcrInitOptions)
    GETFN(ReleaseOcrPipeline)
    GETFN(ReleaseOcrProcessOptions)

    /* Model path and key */
    char model_path[MAX_PATH];
    snprintf(model_path, sizeof(model_path), "%s\\oneocr.onemodel", dll_dir);

    /* Decode hex key. The buffer is zero-filled, so the decoded key is always
       NUL-terminated; a key that would not fit is rejected rather than
       written past the end of the array. */
    char key[64] = {0};
    size_t hex_len = strlen(key_hex);
    size_t key_len = hex_len / 2;
    if (hex_len % 2 != 0 || key_len >= sizeof(key)) {
        fprintf(stderr, "{\"error\": \"Invalid model key length\"}\n");
        return 1;
    }
    for (size_t i = 0; i < key_len; i++) {
        int hi = hex_nibble(key_hex[i * 2]);
        int lo = hex_nibble(key_hex[i * 2 + 1]);
        if (hi < 0 || lo < 0) {
            fprintf(stderr, "{\"error\": \"Model key is not valid hex\"}\n");
            return 1;
        }
        key[i] = (char)((hi << 4) | lo);
    }

    /* Initialize pipeline */
    long long init_opts = 0;
    pCreateOcrInitOptions(&init_opts);

    long long pipeline = 0;
    long long res = pCreateOcrPipeline(model_path, key, init_opts, &pipeline);
    if (res != 0) {
        fprintf(stderr, "{\"error\": \"CreateOcrPipeline failed: %lld\"}\n", res);
        return 1;
    }

    long long proc_opts = 0;
    pCreateOcrProcessOptions(&proc_opts);
    pOcrProcessOptionsSetMaxRecognitionLineCount(proc_opts, 200);

    /* Load image */
    int w = 0, h = 0;
    unsigned char* data = load_bmp_bgra(img_path, &w, &h);
    if (!data) {
        fprintf(stderr, "{\"error\": \"Failed to load image\"}\n");
        return 1;
    }

    ImageStruct img = {3, w, h, 0, (long long)w * 4, data};

    /* Run OCR */
    long long result = 0;
    res = pRunOcrPipeline(pipeline, &img, proc_opts, &result);
    if (res != 0) {
        fprintf(stderr, "{\"error\": \"RunOcrPipeline failed: %lld\"}\n", res);
        free(data);
        return 1;
    }

    /* Extract results */
    float angle = 0;
    pGetImageAngle(result, &angle);

    long long line_count = 0;
    pGetOcrLineCount(result, &line_count);

    /* Output JSON */
    printf("{\"text_angle\": %.4f, \"lines\": [", angle);

    for (long long i = 0; i < line_count; i++) {
        long long line = 0;
        pGetOcrLine(result, i, &line);

        const char* line_text = NULL;
        pGetOcrLineContent(line, &line_text);

        BBox* line_bbox = NULL;
        pGetOcrLineBoundingBox(line, (void**)&line_bbox);

        long long word_count = 0;
        pGetOcrLineWordCount(line, &word_count);

        if (i > 0) fputs(", ", stdout);

        char esc_line[4096];
        json_escape(line_text ? line_text : "", esc_line, sizeof(esc_line));

        printf("{\"text\": %s, \"bbox\": ", esc_line);
        print_bbox(line_bbox);
        fputs(", \"words\": [", stdout);

        for (long long j = 0; j < word_count; j++) {
            long long word = 0;
            pGetOcrWord(line, j, &word);

            const char* word_text = NULL;
            pGetOcrWordContent(word, &word_text);

            BBox* word_bbox = NULL;
            pGetOcrWordBoundingBox(word, (void**)&word_bbox);

            float word_conf = 0;
            pGetOcrWordConfidence(word, &word_conf);

            if (j > 0) fputs(", ", stdout);

            char esc_word[2048];
            json_escape(word_text ? word_text : "", esc_word, sizeof(esc_word));

            printf("{\"text\": %s, \"confidence\": %.4f, \"bbox\": ", esc_word, word_conf);
            print_bbox(word_bbox);
            fputs("}", stdout);
        }

        fputs("]}", stdout);
    }

    /* Close the document and flush it to stdout */
    fputs("]}\n", stdout);
    fflush(stdout);

    /* Cleanup */
    pReleaseOcrResult(result);
    free(data);
    pReleaseOcrProcessOptions(proc_opts);
    pReleaseOcrPipeline(pipeline);
    pReleaseOcrInitOptions(init_opts);
    FreeLibrary(hmod);

    return 0;
}
|
| 333 |
+
"""
|
| 334 |
+
|
| 335 |
+
# ─── Python Bridge ─────────────────────────────────────────────────────────
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class WineBridge:
|
| 339 |
+
"""Bridge to run OneOCR DLL on Linux via Wine.
|
| 340 |
+
|
| 341 |
+
Strategy:
|
| 342 |
+
1. Cross-compile a minimal C loader (.exe) using MinGW
|
| 343 |
+
2. Run it via `wine64 oneocr_loader.exe <args>`
|
| 344 |
+
3. Parse JSON output
|
| 345 |
+
|
| 346 |
+
One-time setup on Linux:
|
| 347 |
+
sudo apt install wine64 mingw-w64 # Debian/Ubuntu
|
| 348 |
+
sudo dnf install wine mingw64-gcc # Fedora
|
| 349 |
+
sudo pacman -S wine mingw-w64-gcc # Arch
|
| 350 |
+
"""
|
| 351 |
+
|
| 352 |
+
def __init__(self, ocr_data_dir: str | Path | None = None):
|
| 353 |
+
self._base = Path(__file__).resolve().parent.parent
|
| 354 |
+
self._ocr_data = Path(ocr_data_dir) if ocr_data_dir else self._base / "ocr_data"
|
| 355 |
+
self._loader_exe = self._base / "tools" / "oneocr_loader.exe"
|
| 356 |
+
self._loader_c = self._base / "tools" / "oneocr_loader.c"
|
| 357 |
+
self._model_key = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 358 |
+
|
| 359 |
+
# Detect Wine
|
| 360 |
+
self._wine = self._find_wine()
|
| 361 |
+
self._mingw = self._find_mingw()
|
| 362 |
+
|
| 363 |
+
@staticmethod
|
| 364 |
+
def _find_wine() -> str | None:
|
| 365 |
+
"""Find Wine executable."""
|
| 366 |
+
for name in ("wine64", "wine"):
|
| 367 |
+
path = shutil.which(name)
|
| 368 |
+
if path:
|
| 369 |
+
return path
|
| 370 |
+
return None
|
| 371 |
+
|
| 372 |
+
@staticmethod
|
| 373 |
+
def _find_mingw() -> str | None:
|
| 374 |
+
"""Find MinGW cross-compiler."""
|
| 375 |
+
for name in ("x86_64-w64-mingw32-gcc", "x86_64-w64-mingw32-gcc-posix"):
|
| 376 |
+
path = shutil.which(name)
|
| 377 |
+
if path:
|
| 378 |
+
return path
|
| 379 |
+
return None
|
| 380 |
+
|
| 381 |
+
def check_requirements(self) -> dict[str, bool | str]:
|
| 382 |
+
"""Check if all requirements are met."""
|
| 383 |
+
checks = {
|
| 384 |
+
"platform": platform.system(),
|
| 385 |
+
"wine_found": self._wine is not None,
|
| 386 |
+
"wine_path": self._wine or "not found",
|
| 387 |
+
"mingw_found": self._mingw is not None,
|
| 388 |
+
"mingw_path": self._mingw or "not found",
|
| 389 |
+
"dll_exists": (self._ocr_data / "oneocr.dll").exists(),
|
| 390 |
+
"model_exists": (self._ocr_data / "oneocr.onemodel").exists(),
|
| 391 |
+
"onnxruntime_exists": (self._ocr_data / "onnxruntime.dll").exists(),
|
| 392 |
+
"loader_compiled": self._loader_exe.exists(),
|
| 393 |
+
}
|
| 394 |
+
checks["ready"] = all([
|
| 395 |
+
checks["wine_found"],
|
| 396 |
+
checks["dll_exists"],
|
| 397 |
+
checks["model_exists"],
|
| 398 |
+
checks["onnxruntime_exists"],
|
| 399 |
+
checks["loader_compiled"] or checks["mingw_found"],
|
| 400 |
+
])
|
| 401 |
+
return checks
|
| 402 |
+
|
| 403 |
+
def compile_loader(self) -> bool:
|
| 404 |
+
"""Cross-compile the C loader using MinGW."""
|
| 405 |
+
if not self._mingw:
|
| 406 |
+
raise RuntimeError(
|
| 407 |
+
"MinGW cross-compiler not found. Install it:\n"
|
| 408 |
+
" Ubuntu/Debian: sudo apt install mingw-w64\n"
|
| 409 |
+
" Fedora: sudo dnf install mingw64-gcc\n"
|
| 410 |
+
" Arch: sudo pacman -S mingw-w64-gcc"
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
# Write C source
|
| 414 |
+
self._loader_c.write_text(WINE_LOADER_C, encoding="utf-8")
|
| 415 |
+
|
| 416 |
+
# Compile
|
| 417 |
+
result = subprocess.run(
|
| 418 |
+
[self._mingw, "-O2", "-o", str(self._loader_exe), str(self._loader_c)],
|
| 419 |
+
capture_output=True, text=True, timeout=30,
|
| 420 |
+
)
|
| 421 |
+
|
| 422 |
+
if result.returncode != 0:
|
| 423 |
+
raise RuntimeError(f"Compilation failed:\n{result.stderr}")
|
| 424 |
+
|
| 425 |
+
return self._loader_exe.exists()
|
| 426 |
+
|
| 427 |
+
def recognize_file(self, image_path: str | Path) -> dict:
|
| 428 |
+
"""Run OCR on an image file.
|
| 429 |
+
|
| 430 |
+
Args:
|
| 431 |
+
image_path: Path to image (PNG, JPEG, BMP).
|
| 432 |
+
|
| 433 |
+
Returns:
|
| 434 |
+
Dict with 'text_angle', 'lines' (each with 'text', 'bbox', 'words').
|
| 435 |
+
"""
|
| 436 |
+
image_path = Path(image_path)
|
| 437 |
+
|
| 438 |
+
if not self._loader_exe.exists():
|
| 439 |
+
self.compile_loader()
|
| 440 |
+
|
| 441 |
+
# Convert image to BMP for the C loader
|
| 442 |
+
bmp_path = self._to_bmp(image_path)
|
| 443 |
+
|
| 444 |
+
try:
|
| 445 |
+
# Convert paths to Windows format for Wine
|
| 446 |
+
dll_dir = self._to_wine_path(self._ocr_data)
|
| 447 |
+
bmp_wine = self._to_wine_path(bmp_path)
|
| 448 |
+
key_hex = self._model_key.hex()
|
| 449 |
+
|
| 450 |
+
# Run via Wine
|
| 451 |
+
cmd = [self._wine, str(self._loader_exe), dll_dir, bmp_wine, key_hex]
|
| 452 |
+
|
| 453 |
+
result = subprocess.run(
|
| 454 |
+
cmd,
|
| 455 |
+
capture_output=True,
|
| 456 |
+
text=True,
|
| 457 |
+
timeout=60,
|
| 458 |
+
env={**os.environ, "WINEDEBUG": "-all"}, # suppress Wine debug
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
if result.returncode != 0:
|
| 462 |
+
raise RuntimeError(f"Wine loader failed:\n{result.stderr}")
|
| 463 |
+
|
| 464 |
+
# Parse JSON output
|
| 465 |
+
return json.loads(result.stdout.strip())
|
| 466 |
+
|
| 467 |
+
finally:
|
| 468 |
+
if bmp_path != image_path and bmp_path.exists():
|
| 469 |
+
bmp_path.unlink()
|
| 470 |
+
|
| 471 |
+
def recognize_pil(self, image: "Image.Image") -> dict:
|
| 472 |
+
"""Run OCR on a PIL Image."""
|
| 473 |
+
with tempfile.NamedTemporaryFile(suffix=".bmp", delete=False) as f:
|
| 474 |
+
image.convert("RGBA").save(f.name, format="BMP")
|
| 475 |
+
try:
|
| 476 |
+
return self.recognize_file(f.name)
|
| 477 |
+
finally:
|
| 478 |
+
os.unlink(f.name)
|
| 479 |
+
|
| 480 |
+
@staticmethod
|
| 481 |
+
def _to_bmp(path: Path) -> Path:
|
| 482 |
+
"""Convert image to BMP if needed."""
|
| 483 |
+
if path.suffix.lower() == ".bmp":
|
| 484 |
+
return path
|
| 485 |
+
|
| 486 |
+
from PIL import Image as PILImage
|
| 487 |
+
bmp_path = path.with_suffix(".bmp")
|
| 488 |
+
img = PILImage.open(path).convert("RGBA")
|
| 489 |
+
img.save(bmp_path, format="BMP")
|
| 490 |
+
return bmp_path
|
| 491 |
+
|
| 492 |
+
@staticmethod
|
| 493 |
+
def _to_wine_path(path: Path) -> str:
|
| 494 |
+
"""Convert Unix path to Wine Z: drive path."""
|
| 495 |
+
return "Z:" + str(path).replace("/", "\\")
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# ─── Direct approach: Wine + ctypes (experimental) ─────────────────
|
| 499 |
+
|
| 500 |
+
class WineCtypesBridge:
    """Experimental alternative: load the DLL from Python through Wine's loader.

    Not implemented yet. The intended approach is:
        1. Prepare a Wine prefix containing the DLLs
        2. Load the DLL with ctypes via Wine's loader

    It would additionally require:
        - winelib development headers
        - A proper 64-bit Wine prefix
    """

    # TODO: Implement if subprocess approach works
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
# ─── CLI ───────────────────────────────────────────────────────────
|
| 515 |
+
|
| 516 |
+
def main() -> int:
    """CLI entry point for testing Wine bridge.

    Commands:
        check    Print the requirement checklist.
        compile  Cross-compile the C loader.
        run      OCR the file given with --image and print the JSON result.
        test     OCR every PNG in ``working_space/input``.

    Returns:
        Process exit status: 0 on success, 1 when the command failed,
        2 when a required argument is missing.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="OneOCR Wine Bridge")
    parser.add_argument("command", choices=["check", "compile", "run", "test"])
    parser.add_argument("--image", "-i", help="Image path for run/test")
    parser.add_argument("--ocr-data", help="Path to ocr_data directory")
    args = parser.parse_args()

    bridge = WineBridge(ocr_data_dir=args.ocr_data)

    if args.command == "check":
        checks = bridge.check_requirements()
        print("Wine Bridge Requirements Check:")
        for k, v in checks.items():
            status = "✅" if v and v != "not found" else "❌"
            print(f"  {status} {k}: {v}")
        return 0

    if args.command == "compile":
        try:
            bridge.compile_loader()
        except RuntimeError as e:
            print(f"❌ {e}")
            return 1
        print("✅ Loader compiled successfully")
        return 0

    if args.command == "run":
        if not args.image:
            print("Error: --image required for run command", file=sys.stderr)
            return 2
        try:
            result = bridge.recognize_file(args.image)
        except RuntimeError as e:
            # Setup and loader failures are reported without a traceback.
            print(f"❌ {e}", file=sys.stderr)
            return 1
        print(json.dumps(result, indent=2, ensure_ascii=False))
        return 0

    # args.command == "test": run on all test images
    test_dir = Path(__file__).resolve().parent.parent / "working_space" / "input"
    if not test_dir.exists():
        print(f"Test directory not found: {test_dir}")
        return 1

    failures = 0
    for img in sorted(test_dir.glob("*.png")):
        try:
            result = bridge.recognize_file(img)
            lines = result.get("lines", [])
            text = " | ".join(line["text"] for line in lines[:3])
            # Mark the preview as shortened only when it actually was.
            preview = text if len(text) <= 80 else text[:80] + "..."
            print(f"  ✅ {img.name}: {preview}")
        except Exception as e:
            # Keep going so one bad image does not hide the other results.
            failures += 1
            print(f"  ❌ {img.name}: {e}")
    return 1 if failures else 0


if __name__ == "__main__":
    raise SystemExit(main())
|