OneOCR Dev
committed on
Commit
·
ce847d4
0
Parent(s):
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +31 -0
- .python-version +1 -0
- README.md +397 -0
- _archive/analysis/analyze_boundaries.py +91 -0
- _archive/analysis/analyze_crypto_log.py +95 -0
- _archive/analysis/analyze_decrypt.py +581 -0
- _archive/analysis/analyze_dx.py +137 -0
- _archive/analysis/analyze_extracted.py +145 -0
- _archive/analysis/analyze_model.py +64 -0
- _archive/analysis/decrypt_config.py +259 -0
- _archive/analysis/find_chunks.py +80 -0
- _archive/analysis/walk_payload.py +129 -0
- _archive/analyze_lm_features.py +110 -0
- _archive/analyze_models.py +82 -0
- _archive/analyze_pipeline.py +79 -0
- _archive/attempts/bcrypt_decrypt.py +423 -0
- _archive/attempts/create_test_image.py +21 -0
- _archive/attempts/decrypt_model.py +338 -0
- _archive/attempts/decrypt_with_static_iv.py +302 -0
- _archive/attempts/disasm_bcrypt_calls.py +143 -0
- _archive/attempts/disasm_crypto.py +156 -0
- _archive/attempts/disasm_full_cipher.py +138 -0
- _archive/attempts/disasm_proper.py +95 -0
- _archive/attempts/discover_key_derivation.py +126 -0
- _archive/attempts/dll_bcrypt_analysis.py +63 -0
- _archive/attempts/dll_crypto_analysis.py +183 -0
- _archive/attempts/extract_onnx.py +235 -0
- _archive/attempts/extract_strings.py +37 -0
- _archive/attempts/find_offset.py +44 -0
- _archive/attempts/frida_hook.py +328 -0
- _archive/attempts/frida_loader.py +50 -0
- _archive/attempts/peek_header.py +92 -0
- _archive/attempts/static_decrypt.py +289 -0
- _archive/attempts/verify_bcrypt.py +181 -0
- _archive/attempts/verify_key_derivation.py +98 -0
- _archive/attempts/verify_models.py +228 -0
- _archive/brainstorm.md +355 -0
- _archive/crack_config.py +84 -0
- _archive/crack_endian.py +65 -0
- _archive/debug_detector.py +80 -0
- _archive/decode_config.py +74 -0
- _archive/dedup.py +687 -0
- _archive/dedup_old.py +595 -0
- _archive/hooks/hook_decrypt.py +344 -0
- _archive/hooks/hook_full_bcrypt.py +441 -0
- _archive/hooks/hook_full_log.py +265 -0
- _archive/hooks/hook_hash.py +340 -0
- _archive/inspect_config_blob.py +80 -0
- _archive/inspect_custom_ops.py +39 -0
- _archive/inspect_graph_deep.py +60 -0
.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
build/
|
| 6 |
+
dist/
|
| 7 |
+
|
| 8 |
+
# Virtual environments
|
| 9 |
+
.venv/
|
| 10 |
+
|
| 11 |
+
# IDE
|
| 12 |
+
.vscode/
|
| 13 |
+
.idea/
|
| 14 |
+
|
| 15 |
+
# OS
|
| 16 |
+
.DS_Store
|
| 17 |
+
Thumbs.db
|
| 18 |
+
|
| 19 |
+
# Runtime data — large binary files (do NOT commit)
|
| 20 |
+
ocr_data/
|
| 21 |
+
oneocr_extracted/
|
| 22 |
+
|
| 23 |
+
# Working space outputs
|
| 24 |
+
working_space/output/
|
| 25 |
+
|
| 26 |
+
# UV lock (optional — regenerated by uv)
|
| 27 |
+
uv.lock
|
| 28 |
+
|
| 29 |
+
# Test images - too large for HF, stored locally
|
| 30 |
+
working_space/input/*.png
|
| 31 |
+
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
README.md
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OneOCR — Reverse-Engineered Cross-Platform OCR Pipeline
|
| 2 |
+
|
| 3 |
+
Full reimplementation of Microsoft's OneOCR engine from Windows Snipping Tool.
|
| 4 |
+
`.onemodel` encryption cracked, 34 ONNX models extracted, all custom ops replaced — runs on any OS with `onnxruntime`.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Project Status
|
| 9 |
+
|
| 10 |
+
| Component | Status | Details |
|
| 11 |
+
|---|---|---|
|
| 12 |
+
| **.onemodel decryption** | ✅ Done | AES-256-CFB128, static key + IV |
|
| 13 |
+
| **Model extraction** | ✅ Done | 34 ONNX models, 33 config files |
|
| 14 |
+
| **Custom op unlocking** | ✅ Done | `OneOCRFeatureExtract` → `Gemm`/`Conv1x1` |
|
| 15 |
+
| **ONNX pipeline** | ⚠️ Partial | **53% match rate** vs DLL (10/19 test images) |
|
| 16 |
+
| **DLL pipeline** | ✅ Done | ctypes wrapper, Windows only |
|
| 17 |
+
|
| 18 |
+
### Known ONNX Engine Limitations
|
| 19 |
+
|
| 20 |
+
The Python reimplementation achieves **53% match rate** against the original DLL. Below is a detailed breakdown of the remaining issues.
|
| 21 |
+
|
| 22 |
+
#### Issue 1: False FPN2 Detections (4 images)
|
| 23 |
+
**Images:** ocr_test 6, 13, 17, 18
|
| 24 |
+
**Symptom:** Panel edges / dialog borders detected as text
|
| 25 |
+
**Cause:** FPN2 (stride=4) sees edges as text-like textures
|
| 26 |
+
**DLL solution:** `SeglinkProposals` — advanced C++ post-processing with multi-stage NMS:
|
| 27 |
+
- `textline_hardnms_iou_threshold = 0.32`
|
| 28 |
+
- `textline_groupnms_span_ratio_threshold = 0.3`
|
| 29 |
+
- `ambiguous_nms_threshold = 0.3` / `ambiguous_overlap_threshold = 0.5`
|
| 30 |
+
- `K_of_detections` — per-scale detection limit
|
| 31 |
+
|
| 32 |
+
#### Issue 2: Missing Small Characters "..." (2 images)
|
| 33 |
+
**Images:** ocr_test 7, 14
|
| 34 |
+
**Symptom:** Three dots too small to detect
|
| 35 |
+
**Cause:** Minimum `min_component_pixels` and `min_area` thresholds insufficient
|
| 36 |
+
**DLL solution:** `SeglinkGroup` — groups neighboring segments into a single line
|
| 37 |
+
|
| 38 |
+
#### Issue 3: Character Recognition Errors (2 images)
|
| 39 |
+
**Images:** ocr_test 1, 15
|
| 40 |
+
**Symptom:** "iob" instead of "job", extra text from margins
|
| 41 |
+
**Cause:** Differences in text cropping/preprocessing
|
| 42 |
+
**DLL solution:** `BaseNormalizer` — sophisticated text line normalization
|
| 43 |
+
|
| 44 |
+
#### Issue 4: Large Images (test.png — 31.8% match)
|
| 45 |
+
**Symptom:** 55 of 74 lines detected, some cut off at edges
|
| 46 |
+
**Cause:** Adaptive Scaling — DLL scales at multiple levels
|
| 47 |
+
**DLL solution:** `AdaptiveScaling` with `AS_LARGE_TEXT_THRESHOLD`
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Architecture
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
Image (PIL / numpy)
|
| 55 |
+
│
|
| 56 |
+
▼
|
| 57 |
+
┌──────────────────────────────────┐
|
| 58 |
+
│ Detector (model_00) │ PixelLink FPN (fpn2/3/4)
|
| 59 |
+
│ BGR, mean subtraction │ stride = 4 / 8 / 16
|
| 60 |
+
│ → pixel_scores, link_scores │ 8-neighbor, Union-Find
|
| 61 |
+
│ → bounding quads (lines) │ minAreaRect + NMS (IoU 0.2)
|
| 62 |
+
└──────────────────────────────────┘
|
| 63 |
+
│
|
| 64 |
+
▼ for each detected line
|
| 65 |
+
┌──────────────────────────────────┐
|
| 66 |
+
│ Crop + padding (15%) │ Axis-aligned / perspective
|
| 67 |
+
│ ScriptID (model_01) │ 10 scripts: Latin, CJK, Arabic...
|
| 68 |
+
│ RGB / 255.0, height=60px │ HW/PC classification, flip detection
|
| 69 |
+
└──────────────────────────────────┘
|
| 70 |
+
│
|
| 71 |
+
▼ per script
|
| 72 |
+
┌──────────────────────────────────┐
|
| 73 |
+
│ Recognizer (model_02–10) │ DynamicQuantizeLSTM + CTC
|
| 74 |
+
│ Per-script character maps │ Greedy decode with per-char confidence
|
| 75 |
+
│ → text + word confidences │ Word splitting on spaces
|
| 76 |
+
└──────────────────────────────────┘
|
| 77 |
+
│
|
| 78 |
+
▼
|
| 79 |
+
┌──────────────────────────────────┐
|
| 80 |
+
│ Line grouping & sorting │ Y-overlap clustering
|
| 81 |
+
│ Per-word bounding boxes │ Proportional quad interpolation
|
| 82 |
+
│ Text angle estimation │ Median of top-edge angles
|
| 83 |
+
└──────────────────────────────────┘
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### Model Registry (34 models)
|
| 87 |
+
|
| 88 |
+
| Index | Role | Script | Custom Op | Status |
|
| 89 |
+
|-------|------|--------|-----------|--------|
|
| 90 |
+
| 0 | Detector | Universal | `QLinearSigmoid` | ✅ Works |
|
| 91 |
+
| 1 | ScriptID | Universal | — | ✅ Works |
|
| 92 |
+
| 2–10 | Recognizers | Latin/CJK/Arabic/Cyrillic/Devanagari/Greek/Hebrew/Tamil/Thai | `DynamicQuantizeLSTM` | ✅ Work |
|
| 93 |
+
| 11–21 | LangSm (confidence) | Per-script | `OneOCRFeatureExtract` → **Gemm** | ✅ Unlocked |
|
| 94 |
+
| 22–32 | LangMd (confidence) | Per-script | `OneOCRFeatureExtract` → **Gemm** | ✅ Unlocked |
|
| 95 |
+
| 33 | LineLayout | Universal | `OneOCRFeatureExtract` → **Conv1x1** | ✅ Unlocked |
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## Quick Start
|
| 100 |
+
|
| 101 |
+
### Requirements
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
pip install onnxruntime numpy opencv-python-headless Pillow pycryptodome onnx
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
Or with `uv`:
|
| 108 |
+
```bash
|
| 109 |
+
uv sync --extra extract
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### Model Extraction (one-time)
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
# Full pipeline: decrypt → extract → unlock → verify
|
| 116 |
+
python tools/extract_pipeline.py ocr_data/oneocr.onemodel
|
| 117 |
+
|
| 118 |
+
# Verify existing models only
|
| 119 |
+
python tools/extract_pipeline.py --verify-only
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### Usage
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
from ocr.engine_onnx import OcrEngineOnnx
|
| 126 |
+
from PIL import Image
|
| 127 |
+
|
| 128 |
+
engine = OcrEngineOnnx()
|
| 129 |
+
result = engine.recognize_pil(Image.open("screenshot.png"))
|
| 130 |
+
|
| 131 |
+
print(result.text) # "Hello World"
|
| 132 |
+
print(result.average_confidence) # 0.975
|
| 133 |
+
print(result.text_angle) # 0.0
|
| 134 |
+
|
| 135 |
+
for line in result.lines:
|
| 136 |
+
for word in line.words:
|
| 137 |
+
print(f" '{word.text}' conf={word.confidence:.0%} "
|
| 138 |
+
f"bbox=({word.bounding_rect.x1:.0f},{word.bounding_rect.y1:.0f})")
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### API Reference
|
| 142 |
+
|
| 143 |
+
```python
|
| 144 |
+
engine = OcrEngineOnnx(
|
| 145 |
+
models_dir="path/to/onnx_models", # optional
|
| 146 |
+
config_dir="path/to/config_data", # optional
|
| 147 |
+
providers=["CUDAExecutionProvider"], # optional (default: CPU)
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
# Input formats:
|
| 151 |
+
result = engine.recognize_pil(pil_image) # PIL Image
|
| 152 |
+
result = engine.recognize_numpy(rgb_array) # numpy (H,W,3) RGB
|
| 153 |
+
result = engine.recognize_bytes(png_bytes) # raw bytes (PNG/JPEG)
|
| 154 |
+
|
| 155 |
+
# Result:
|
| 156 |
+
result.text # str — full recognized text
|
| 157 |
+
result.text_angle # float — detected rotation angle
|
| 158 |
+
result.lines # list[OcrLine]
|
| 159 |
+
result.average_confidence # float — overall confidence 0-1
|
| 160 |
+
result.error # str | None — error message
|
| 161 |
+
|
| 162 |
+
# Per-word:
|
| 163 |
+
word.text # str
|
| 164 |
+
word.confidence # float — CTC confidence per word
|
| 165 |
+
word.bounding_rect # BoundingRect (x1,y1...x4,y4 quadrilateral)
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## Project Structure
|
| 171 |
+
|
| 172 |
+
```
|
| 173 |
+
ONEOCR/
|
| 174 |
+
├── main.py # Usage example (both engines)
|
| 175 |
+
├── pyproject.toml # Project config & dependencies
|
| 176 |
+
├── README.md # This documentation
|
| 177 |
+
├── .gitignore
|
| 178 |
+
│
|
| 179 |
+
├── ocr/ # Core OCR package
|
| 180 |
+
│ ├── __init__.py # Exports OcrEngine, OcrEngineOnnx, models
|
| 181 |
+
│ ├── engine.py # DLL wrapper (Windows only, 374 lines)
|
| 182 |
+
│ ├── engine_onnx.py # ONNX engine (cross-platform, ~1100 lines)
|
| 183 |
+
│ └── models.py # Data models: OcrResult, OcrLine, OcrWord
|
| 184 |
+
│
|
| 185 |
+
├── tools/ # Utilities
|
| 186 |
+
│ ├── extract_pipeline.py # Extraction pipeline (decrypt→extract→unlock→verify)
|
| 187 |
+
│ ├── visualize_ocr.py # OCR result visualization with bounding boxes
|
| 188 |
+
│ └── test_quick.py # Quick OCR test on images
|
| 189 |
+
│
|
| 190 |
+
├── ocr_data/ # Runtime data (DO NOT commit)
|
| 191 |
+
│ ├── oneocr.dll # Original DLL (Windows only)
|
| 192 |
+
│ ├── oneocr.onemodel # Encrypted model container
|
| 193 |
+
│ └── onnxruntime.dll # ONNX Runtime DLL
|
| 194 |
+
│
|
| 195 |
+
├── oneocr_extracted/ # Extracted models (auto-generated)
|
| 196 |
+
│ ├── onnx_models/ # 34 raw ONNX (models 11-33 have custom ops)
|
| 197 |
+
│ ├── onnx_models_unlocked/ # 23 unlocked (models 11-33, standard ONNX ops)
|
| 198 |
+
│ └── config_data/ # Character maps, rnn_info, manifest, configs
|
| 199 |
+
│
|
| 200 |
+
├── working_space/ # Test images
|
| 201 |
+
│ └── input/ # 19 test images
|
| 202 |
+
│
|
| 203 |
+
└── _archive/ # Archive — RE scripts, analyses, prototypes
|
| 204 |
+
├── temp/re_output/ # DLL reverse engineering results
|
| 205 |
+
├── attempts/ # Decryption attempts
|
| 206 |
+
├── analysis/ # Cryptographic analyses
|
| 207 |
+
└── hooks/ # Frida hooks
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
## Technical Details
|
| 213 |
+
|
| 214 |
+
### .onemodel Encryption
|
| 215 |
+
|
| 216 |
+
| Element | Value |
|
| 217 |
+
|---------|-------|
|
| 218 |
+
| Algorithm | AES-256-CFB128 |
|
| 219 |
+
| Master Key | `kj)TGtrK>f]b[Piow.gU+nC@s""""""4` (32B) |
|
| 220 |
+
| IV | `Copyright @ OneO` (16B) |
|
| 221 |
+
| DX key | `SHA256(master_key + file[8:24])` |
|
| 222 |
+
| Config key | `SHA256(DX[48:64] + DX[32:48])` |
|
| 223 |
+
| Chunk key | `SHA256(chunk_header[16:32] + chunk_header[0:16])` |
|
| 224 |
+
|
| 225 |
+
### OneOCRFeatureExtract — Cracked Custom Op
|
| 226 |
+
|
| 227 |
+
Proprietary op (domain `com.microsoft.oneocr`) stores weights as a **big-endian float32** blob in a STRING tensor.
|
| 228 |
+
|
| 229 |
+
**Models 11–32** (21→50 features):
|
| 230 |
+
```
|
| 231 |
+
config_blob (4492B, big-endian float32):
|
| 232 |
+
W[21×50] = 1050 floats (weight matrix)
|
| 233 |
+
b[50] = 50 floats (bias)
|
| 234 |
+
metadata = 23 floats (dimensions [21, 50, 2], flags, calibration)
|
| 235 |
+
|
| 236 |
+
Replacement: Gemm(input, W^T, b)
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
**Model 33** (256→16 channels):
|
| 240 |
+
```
|
| 241 |
+
config_blob (16548B, big-endian float32):
|
| 242 |
+
W[256×16] = 4096 floats (convolution weights)
|
| 243 |
+
b[16] = 16 floats (bias)
|
| 244 |
+
metadata = 25 floats (dimensions [256, 16], flags)
|
| 245 |
+
|
| 246 |
+
Replacement: Conv(input, W[in,out].T → [16,256,1,1], b, kernel=1x1)
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
### Detector Configuration (from DLL protobuf manifest)
|
| 250 |
+
|
| 251 |
+
```
|
| 252 |
+
segment_conf_threshold: 0.7 (field 8)
|
| 253 |
+
textline_conf_threshold per-FPN: P2=0.7, P3=0.8, P4=0.8 (field 9)
|
| 254 |
+
textline_nms_threshold: 0.2 (field 10)
|
| 255 |
+
textline_overlap_threshold: 0.4 (field 11)
|
| 256 |
+
text_confidence_threshold: 0.8 (field 13)
|
| 257 |
+
ambiguous_nms_threshold: 0.3 (field 15)
|
| 258 |
+
ambiguous_overlap_threshold: 0.5 (field 16)
|
| 259 |
+
ambiguous_save_threshold: 0.4 (field 17)
|
| 260 |
+
textline_hardnms_iou_threshold: 0.32 (field 20)
|
| 261 |
+
textline_groupnms_span_ratio_threshold: 0.3 (field 21)
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
### PixelLink Detector
|
| 265 |
+
|
| 266 |
+
- **FPN levels**: fpn2 (stride=4), fpn3 (stride=8), fpn4 (stride=16)
|
| 267 |
+
- **Outputs per level**: `scores_hori/vert` (pixel text probability), `link_scores_hori/vert` (8-neighbor connectivity), `bbox_deltas_hori/vert` (corner offsets)
|
| 268 |
+
- **Post-processing**: Threshold pixels → Union-Find connected components → bbox regression → NMS
|
| 269 |
+
- **Detects TEXT LINES** — word splitting comes from the recognizer
|
| 270 |
+
|
| 271 |
+
### CTC Recognition
|
| 272 |
+
|
| 273 |
+
- Target height: 60px, aspect ratio preserved
|
| 274 |
+
- Input: RGB / 255.0, NCHW format
|
| 275 |
+
- Output: log-softmax [T, 1, N_chars]
|
| 276 |
+
- Decoding: greedy argmax with repeat merging + blank removal
|
| 277 |
+
- Per-character confidence via `exp(max_logprob)`
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## DLL Reverse Engineering — Results & Materials
|
| 282 |
+
|
| 283 |
+
### DLL Source Structure (from debug symbols)
|
| 284 |
+
|
| 285 |
+
```
|
| 286 |
+
C:\__w\1\s\CoreEngine\Native\
|
| 287 |
+
├── TextDetector/
|
| 288 |
+
│ ├── AdaptiveScaling ← multi-level image scaling
|
| 289 |
+
│ ├── SeglinkProposal ← KEY: detection post-processing
|
| 290 |
+
│ ├── SeglinkGroup.h ← segment grouping into lines
|
| 291 |
+
│ ├── TextLinePolygon ← precise text contouring
|
| 292 |
+
│ ├── RelationRCNNRpn2 ← relational region proposal network
|
| 293 |
+
│ ├── BaseRCNN, DQDETR ← alternative detectors
|
| 294 |
+
│ ├── PolyFitting ← polynomial fitting
|
| 295 |
+
│ └── BarcodePolygon ← barcode detection
|
| 296 |
+
│
|
| 297 |
+
├── TextRecognizer/
|
| 298 |
+
│ ├── TextLineRecognizerImpl ← main CTC implementation
|
| 299 |
+
│ ├── ArgMaxDecoder ← CTC decoding
|
| 300 |
+
│ ├── ConfidenceProcessor ← confidence models (models 11-21)
|
| 301 |
+
│ ├── RejectionProcessor ← rejection models (models 22-32)
|
| 302 |
+
│ ├── DbLstm ← dynamic batch LSTM
|
| 303 |
+
│ └── CharacterMap/ ← per-script character maps
|
| 304 |
+
│
|
| 305 |
+
├── TextAnalyzer/
|
| 306 |
+
│ ├── TextAnalyzerImpl ← text layout analysis
|
| 307 |
+
│ └── AuxMltClsClassifier ← auxiliary classifier
|
| 308 |
+
│
|
| 309 |
+
├── TextNormalizer/
|
| 310 |
+
│ ├── BaseNormalizer ← text line normalization
|
| 311 |
+
│ └── ConcatTextLines ← line concatenation
|
| 312 |
+
│
|
| 313 |
+
├── TextPipeline/
|
| 314 |
+
│ ├── TextPipelineDevImpl ← main pipeline
|
| 315 |
+
│ └── FilterXY ← position-based filtering
|
| 316 |
+
│
|
| 317 |
+
├── CustomOps/onnxruntime/
|
| 318 |
+
│ ├── SeglinkProposalsOp ← ONNX op (NOT in our models)
|
| 319 |
+
│ ├── XYSeglinkProposalsOp ← XY variant
|
| 320 |
+
│ └── FeatureExtractOp ← = Gemm / Conv1x1
|
| 321 |
+
│
|
| 322 |
+
├── ModelParser/
|
| 323 |
+
│ ├── ModelParser ← .onemodel parsing
|
| 324 |
+
│ └── Crypto ← AES-256-CFB128
|
| 325 |
+
│
|
| 326 |
+
└── Common/
|
| 327 |
+
├── ImageUtility ← image conversion
|
| 328 |
+
└── ImageFeature ← image features
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
### RE Materials
|
| 332 |
+
|
| 333 |
+
Reverse engineering results in `_archive/temp/re_output/`:
|
| 334 |
+
- `03_oneocr_classes.txt` — 186 C++ classes
|
| 335 |
+
- `06_config_strings.txt` — 429 config strings
|
| 336 |
+
- `15_manifest_decoded.txt` — 1182 lines of decoded protobuf manifest
|
| 337 |
+
- `09_constants.txt` — 42 float + 14 double constants (800.0, 0.7, 0.8, 0.92...)
|
| 338 |
+
- `10_disassembly.txt` — disassembly of key exports
|
| 339 |
+
|
| 340 |
+
---
|
| 341 |
+
|
| 342 |
+
## For Future Developers — Roadmap
|
| 343 |
+
|
| 344 |
+
### Priority 1: SeglinkProposals (hardest, highest impact)
|
| 345 |
+
|
| 346 |
+
This is the key C++ post-processing in the DLL that is NOT part of the ONNX models.
|
| 347 |
+
Responsible for ~80% of the differences between the DLL and our implementation.
|
| 348 |
+
|
| 349 |
+
**What it does:**
|
| 350 |
+
1. Takes raw pixel_scores + link_scores + bbox_deltas from all 3 FPN levels
|
| 351 |
+
2. Groups segments into lines (SeglinkGroup) — merges neighboring small components into a single line
|
| 352 |
+
3. Multi-stage NMS: textline_nms → hardnms → ambiguous_nms → groupnms
|
| 353 |
+
4. Confidence filtering with `text_confidence_threshold = 0.8`
|
| 354 |
+
5. `K_of_detections` — detection count limit
|
| 355 |
+
|
| 356 |
+
**Where to look:**
|
| 357 |
+
- `_archive/temp/re_output/06_config_strings.txt` — parameter names
|
| 358 |
+
- `_archive/temp/re_output/15_manifest_decoded.txt` — parameter values
|
| 359 |
+
- `SeglinkProposal` class in DLL — ~2000 lines of C++
|
| 360 |
+
|
| 361 |
+
**Approach:**
|
| 362 |
+
- Decompile `SeglinkProposal::Process` with IDA Pro / Ghidra
|
| 363 |
+
- Alternatively: black-box testing of different NMS configurations
|
| 364 |
+
|
| 365 |
+
### Priority 2: AdaptiveScaling
|
| 366 |
+
|
| 367 |
+
The DLL dynamically scales images based on text size.
|
| 368 |
+
|
| 369 |
+
**Parameters:**
|
| 370 |
+
- `AS_LARGE_TEXT_THRESHOLD` — large text threshold
|
| 371 |
+
- Multi-scale: DLL can run the detector at multiple scales
|
| 372 |
+
|
| 373 |
+
### Priority 3: BaseNormalizer
|
| 374 |
+
|
| 375 |
+
The DLL normalizes text crops before recognition more effectively than our simple resize.
|
| 376 |
+
|
| 377 |
+
### Priority 4: Confidence/Rejection Models (11-32)
|
| 378 |
+
|
| 379 |
+
The DLL uses models 11-32 to filter results — we skip them. Integration could improve
|
| 380 |
+
precision by removing false detections.
|
| 381 |
+
|
| 382 |
+
---
|
| 383 |
+
|
| 384 |
+
## Performance
|
| 385 |
+
|
| 386 |
+
| Operation | ONNX (CPU) | DLL | Notes |
|
| 387 |
+
|---|---|---|---|
|
| 388 |
+
| Detection (PixelLink) | ~50-200ms | ~15-50ms | Model inference + post-processing |
|
| 389 |
+
| ScriptID | ~5ms | ~3ms | Single forward pass |
|
| 390 |
+
| Recognition (CTC) | ~30ms/line | ~10ms/line | Per-script LSTM |
|
| 391 |
+
| Full pipeline | ~300-1000ms | ~15-135ms | Depends on line count |
|
| 392 |
+
|
| 393 |
+
---
|
| 394 |
+
|
| 395 |
+
## License
|
| 396 |
+
|
| 397 |
+
For research and educational purposes only.
|
_archive/analysis/analyze_boundaries.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze exact chunk boundary structure in the .onemodel file.

Reads the encrypted container (``ocr_data/oneocr.onemodel``) together with a
Frida-captured crypto log (``temp/crypto_log.json``) and cross-references the
SHA-256 key-derivation inputs against file offsets to work out where each
encrypted chunk starts and how large its on-disk header is.
"""
import struct, json

with open("ocr_data/oneocr.onemodel", "rb") as f:
    fdata = f.read()
# Fix: original used json.load(open(...)) which leaks the file handle.
with open("temp/crypto_log.json") as f:
    log = json.load(f)

sha256s = [op for op in log if op["op"] == "sha256"]
sha_map = {s["output"]: s["input"] for s in sha256s}
decrypts = [op for op in log if op["op"] == "decrypt"]


# Get info for first few payload chunks
def get_chunk_info(dec_idx):
    """Return size/offset info for the decrypt op at *dec_idx*.

    The chunk key was derived as SHA256(chunk_header); the last 16 bytes of
    that SHA input are a checksum that also appears verbatim in the file, so
    ``fdata.find(chk)`` locates the chunk header on disk.
    """
    d = decrypts[dec_idx]
    sha_inp = bytes.fromhex(sha_map[d["aes_key"]])
    s1, s2 = struct.unpack_from("<QQ", sha_inp, 0)
    chk = sha_inp[16:32]
    chk_pos = fdata.find(chk)
    return {
        "dec_idx": dec_idx,
        "enc_size": d["input_size"],
        "size1": s1,
        "size2": s2,
        "chk": chk,
        "chk_pos": chk_pos,
    }


# Focus on first few consecutive large chunks
# From the sorted output, the order in file is: dec#02, dec#03, dec#06, dec#11, dec#16, dec#23, ...
chunks_in_order = [2, 3, 6, 11, 16, 23, 28, 33]
infos = [get_chunk_info(i) for i in chunks_in_order]

print("=== Chunk boundary analysis ===\n")
for i, info in enumerate(infos):
    print(f"dec#{info['dec_idx']:02d}: chk_pos={info['chk_pos']}, size1={info['size1']}, enc_size={info['enc_size']}")

    if i > 0:
        prev = infos[i - 1]
        # Hypothesis: on-disk encrypted data = size1 + 8 (data_size + container_header)
        prev_data_start = prev['chk_pos'] + 32
        prev_on_disk = prev['size1'] + 8
        expected_next_chk = prev_data_start + prev_on_disk
        actual_next_chk = info['chk_pos']
        delta = actual_next_chk - expected_next_chk
        print(f"  Expected chk_pos: {expected_next_chk}, actual: {actual_next_chk}, delta: {delta}")

# Now figure out the EXACT header structure
print("\n=== Bytes around first few chunk boundaries ===\n")

# Between DX and first chunk
dx_end = 24 + 22624  # = 22648
print(f"--- DX end ({dx_end}) to first chunk ---")
for off in range(dx_end, infos[0]['chk_pos'] + 48, 8):
    raw = fdata[off:off + 8]
    val = struct.unpack_from("<Q", raw)[0] if len(raw) == 8 else 0
    print(f"  {off:>8}: {raw.hex()} (uint64={val})")

# Between chunk 0 and chunk 1
c0 = infos[0]
c1 = infos[1]
# data starts at chk_pos + 32, on-disk size is approximately size1+8 or enc_size
# Let's look at bytes around where the boundary should be
c0_data_start = c0['chk_pos'] + 32
c0_approx_end = c0_data_start + c0['size1'] + 8
print(f"\n--- End of dec#{c0['dec_idx']:02d} / Start of dec#{c1['dec_idx']:02d} ---")
print(f"  c0 data_start: {c0_data_start}")
print(f"  c0 size1+8: {c0['size1'] + 8}")
print(f"  c0 approx end: {c0_approx_end}")
print(f"  c1 chk_pos: {c1['chk_pos']}")

for off in range(c0_approx_end - 16, c1['chk_pos'] + 48, 8):
    raw = fdata[off:off + 8]
    val = struct.unpack_from("<Q", raw)[0] if len(raw) == 8 else 0
    ascii_s = ''.join(chr(b) if 32 <= b < 127 else '.' for b in raw)
    print(f"  {off:>8}: {raw.hex()} val={val:<15d} {ascii_s}")

# Check file header
header_size = struct.unpack_from("<Q", fdata, 0)[0]
print(f"\nFile header uint64: {header_size}")
print(f"  = file[0:8] as uint64 LE")

# What if it's NOT a uint64 but two uint32?
h1, h2 = struct.unpack_from("<II", fdata, 0)
print(f"  As two uint32: ({h1}, {h2})")

# file[0:24] detailed view
print("\nFile header [0:24]:")
for off in range(0, 24, 8):
    raw = fdata[off:off + 8]
    val = struct.unpack_from("<Q", raw)[0]
    print(f"  {off:>3}: {raw.hex()} uint64={val}")
|
_archive/analysis/analyze_crypto_log.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze crypto_log.json to understand decrypt sequence and chunk mapping.

Pairs each AES decrypt operation from the Frida crypto log with the SHA-256
operation that derived its key, classifies the key derivation by SHA input
length, then locates where each ciphertext lives (DX index vs. payload) and
scans the decrypted DX index for (size, size+24) uint64 pairs that mark
embedded chunk headers.
"""
import json
import struct

with open("temp/crypto_log.json") as f:
    log = json.load(f)

decrypts = [op for op in log if op["op"] == "decrypt"]
sha256s = [op for op in log if op["op"] == "sha256"]
encrypts = [op for op in log if op["op"] == "encrypt"]

print(f"Total ops: {len(log)} (sha256={len(sha256s)}, decrypt={len(decrypts)}, encrypt={len(encrypts)})")

# Build SHA256 output -> input mapping
sha_map = {s["output"]: s["input"] for s in sha256s}  # output_hex -> input_hex

# Pair each decrypt with its SHA256 key derivation.
# SHA input length identifies the derivation scheme:
#   48B = master_key + file[8:24] (DX key), 32B = sizes + checksum, 16B = sizes only.
print("\n=== Decrypt operations with key derivation ===")
for i, d in enumerate(decrypts):
    key = d["aes_key"]
    sha_input_hex = sha_map.get(key, "UNKNOWN")
    sha_input = bytes.fromhex(sha_input_hex) if sha_input_hex != "UNKNOWN" else b""

    if len(sha_input) == 48:
        desc = "DX_KEY (master+file[8:24])"
    elif len(sha_input) == 32:
        s1, s2 = struct.unpack_from("<QQ", sha_input, 0)
        chk = sha_input[16:32].hex()[:16] + "..."
        desc = f"CHK sizes=({s1},{s2}) chk={chk}"
    elif len(sha_input) == 16:
        s1, s2 = struct.unpack_from("<QQ", sha_input, 0)
        desc = f"NOCHK sizes=({s1},{s2})"
    else:
        desc = f"len={len(sha_input)}"

    first = d["first_bytes"][:32]
    print(f"  dec#{i:02d}: size={d['input_size']:>8}B {desc:50s} out={first}")

# Now search for plaintext first_bytes in decrypted DX to find embedded chunks.
# Fix: original left both file handles open (open(...).read()).
with open("temp/dx_index_decrypted.bin", "rb") as f:
    dx = f.read()
with open("ocr_data/oneocr.onemodel", "rb") as f:
    fdata = f.read()

print("\n=== Locating encrypted data ===")
for i, d in enumerate(decrypts):
    size = d["input_size"]
    first = bytes.fromhex(d["first_bytes"][:32])

    # Search in decrypted DX for the plaintext (this was decrypted in-place)
    # But we need the CIPHERTEXT, which is in the original file (encrypted DX) or payload

    # For chunks embedded in DX: ciphertext is at file offset 24 + dx_offset
    # For chunks in payload: ciphertext is at some file offset after 22684

    # Let's find plaintext in decrypted DX
    dx_pos = dx.find(first)

    # Find ciphertext (first 16 bytes from hook_decrypt dumps)
    # We don't have ciphertext in logs, but we know:
    # - DX encrypted data is at file[24:24+22624]
    # - Payload data is after file[22684]

    if i == 0:
        loc = "DX index itself at file[24:]"
    elif dx_pos >= 0:
        loc = f"embedded in DX at dx_offset={dx_pos} (file_off={24 + dx_pos})"
    else:
        loc = "payload (after file[22684])"

    print(f"  dec#{i:02d}: size={size:>8}B {loc}")

# Scan DX for all uint64 pairs where second = first + 24
print("\n=== All size-pair patterns in DX (s2 = s1 + 24) ===")
pairs = []
for off in range(0, len(dx) - 16):
    s1, s2 = struct.unpack_from("<QQ", dx, off)
    # Fix: original condition "0 < s1 < 100_000_000 and s1 > 10" had a
    # redundant lower bound; 10 < s1 is the effective constraint.
    if s2 == s1 + 24 and 10 < s1 < 100_000_000:
        pairs.append((off, s1, s2))
print(f"Found {len(pairs)} size pairs")
# Deduplicate overlapping pairs
filtered = []
for p in pairs:
    if not filtered or p[0] >= filtered[-1][0] + 16:
        filtered.append(p)
print(f"After dedup: {len(filtered)} pairs")
for off, s1, s2 in filtered:
    # Check if there's a 16-byte checksum before this pair
    has_chk = False
    if off >= 16:
        # Check if the 16 bytes before could be a checksum (non-trivial bytes)
        potential_chk = dx[off - 16:off]
        non_zero = sum(1 for b in potential_chk if b != 0)
        has_chk = non_zero > 8  # At least 8 non-zero bytes
    print(f"  offset={off:>5} (0x{off:04x}): sizes=({s1}, {s2}) chk_before={'YES' if has_chk else 'no'}")
|
_archive/analysis/analyze_decrypt.py
ADDED
|
@@ -0,0 +1,581 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OneOCR .onemodel file analysis and decryption attempt.
|
| 3 |
+
|
| 4 |
+
Known facts:
|
| 5 |
+
- AES-256-CFB via Windows BCrypt CNG API
|
| 6 |
+
- SHA256 used somewhere in the process
|
| 7 |
+
- Key: kj)TGtrK>f]b[Piow.gU+nC@s""""""4 (32 ASCII bytes = 256 bits)
|
| 8 |
+
- After decryption → decompression (zlib/lz4/etc.)
|
| 9 |
+
- Error on wrong key: meta->magic_number == MAGIC_NUMBER (0 vs. 1)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import struct
|
| 13 |
+
import hashlib
|
| 14 |
+
import zlib
|
| 15 |
+
import os
|
| 16 |
+
from collections import Counter
|
| 17 |
+
from typing import Optional
|
| 18 |
+
|
| 19 |
+
# ── Try to import crypto libraries ──
|
| 20 |
+
try:
|
| 21 |
+
from Crypto.Cipher import AES as PyCryptoAES
|
| 22 |
+
HAS_PYCRYPTODOME = True
|
| 23 |
+
except ImportError:
|
| 24 |
+
HAS_PYCRYPTODOME = False
|
| 25 |
+
print("[WARN] PyCryptodome not available, install with: pip install pycryptodome")
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
| 29 |
+
from cryptography.hazmat.backends import default_backend
|
| 30 |
+
HAS_CRYPTOGRAPHY = True
|
| 31 |
+
except ImportError:
|
| 32 |
+
HAS_CRYPTOGRAPHY = False
|
| 33 |
+
print("[WARN] cryptography not available, install with: pip install cryptography")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ═══════════════════════════════════════════════════════════════
|
| 37 |
+
# CONFIGURATION
|
| 38 |
+
# ═══════════════════════════════════════════════════════════════
|
| 39 |
+
|
| 40 |
+
# Path to the encrypted OneOCR model file under analysis.
MODEL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"

# The key as raw bytes (32 bytes = 256 bits for AES-256)
KEY_RAW = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
# Validate with a real exception instead of `assert`: a module-level assert is
# silently stripped under `python -O`, which would let a truncated key slip
# through to every decryption attempt below.
if len(KEY_RAW) != 32:
    raise ValueError(f"Key must be 32 bytes, got {len(KEY_RAW)}")

# SHA256 of the key (another possible key derivation)
KEY_SHA256 = hashlib.sha256(KEY_RAW).digest()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ═══════════════════════════════════════════════════════════════
|
| 51 |
+
# HELPER FUNCTIONS
|
| 52 |
+
# ═══════════════════════════════════════════════════════════════
|
| 53 |
+
|
| 54 |
+
def hex_dump(data: bytes, offset: int = 0, max_lines: int = 32) -> str:
    """Render *data* as a classic hex dump: address, hex bytes, ASCII column.

    At most *max_lines* rows (16 bytes each) are emitted; *offset* shifts the
    printed addresses without changing which bytes are shown.
    """
    limit = min(len(data), max_lines * 16)
    rendered = []
    for start in range(0, limit, 16):
        row = data[start:start + 16]
        hexes = " ".join("%02x" % byte for byte in row)
        printable = "".join(chr(byte) if 32 <= byte < 127 else "." for byte in row)
        rendered.append(f" {offset + start:08x}: {hexes:<48s} {printable}")
    return "\n".join(rendered)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def entropy(data: bytes) -> float:
    """Shannon entropy of *data* in bits per byte (0.0 for empty input)."""
    import math
    if not data:
        return 0.0
    n = len(data)
    acc = 0.0
    for count in Counter(data).values():
        p = count / n
        acc -= p * math.log2(p)
    return acc
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def unique_byte_ratio(data: bytes) -> str:
    """Report how many distinct byte values occur in *data*, out of 256."""
    distinct = len(frozenset(data))
    return "%d/256" % distinct
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def check_known_headers(data: bytes) -> list[str]:
    """Check if data starts with known file/compression magic numbers.

    Returns one finding string per matching signature; the first four bytes
    are additionally probed as little/big-endian uint32 == 1 (the expected
    decrypted magic_number).  Empty list when nothing matches or data is
    shorter than four bytes.
    """
    findings: list[str] = []
    if len(data) < 4:
        return findings

    # (magic prefix, description) — kept in the original priority order so
    # overlapping prefixes (e.g. \x08 vs \x08\x01) report in the same sequence.
    known_magics = [
        (b"\x08", "Protobuf varint field tag (field 1, wire type 0)"),
        (b"\x0a", "Protobuf length-delimited field tag (field 1, wire type 2)"),
        (b"\x78\x01", "Zlib (low compression)"),
        (b"\x78\x5e", "Zlib (default compression)"),
        (b"\x78\x9c", "Zlib (best speed/default)"),
        (b"\x78\xda", "Zlib (best compression)"),
        (b"\x1f\x8b", "Gzip"),
        (b"\x04\x22\x4d\x18", "LZ4 frame"),
        (b"\x28\xb5\x2f\xfd", "Zstandard"),
        (b"\xfd\x37\x7a\x58\x5a\x00", "XZ"),
        (b"\x42\x5a\x68", "Bzip2"),
        (b"PK", "ZIP archive"),
        (b"\x89PNG", "PNG image"),
        (b"ONNX", "ONNX text"),
        (b"\x08\x00", "Protobuf: field 1, varint, value will follow"),
        (b"\x08\x01", "Protobuf: field 1, varint = 1 (could be magic_number=1!)"),
        (b"\x08\x02", "Protobuf: field 1, varint = 2"),
        (b"\x08\x03", "Protobuf: field 1, varint = 3"),
        (b"\x08\x04", "Protobuf: field 1, varint = 4"),
        (b"\x50\x42", "Possible PB (protobuf) marker"),
        (b"\x01\x00\x00\x00", "uint32 LE = 1 (possible magic_number=1)"),
        (b"\x00\x00\x00\x01", "uint32 BE = 1 (possible magic_number=1)"),
    ]

    for magic, desc in known_magics:
        if data.startswith(magic):
            findings.append(f" ★ MATCH: {desc} ({magic.hex()})")

    # Probe the leading 4 bytes as an integer in both byte orders.
    le_val = struct.unpack_from("<I", data, 0)[0]
    be_val = struct.unpack_from(">I", data, 0)[0]
    if le_val == 1:
        findings.append(" ★ uint32_LE at offset 0 = 1 (MAGIC_NUMBER match!)")
    if be_val == 1:
        findings.append(" ★ uint32_BE at offset 0 = 1 (MAGIC_NUMBER match!)")

    return findings
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def try_decompress(data: bytes, label: str = "") -> Optional[bytes]:
    """Try several decompression schemes on *data*.

    Attempts zlib (standard / raw-deflate / gzip windows), LZ4 (frame and raw
    block), and Zstandard, printing diagnostics for every scheme that
    succeeds.  Optional codecs (lz4, zstandard) are skipped when the packages
    are not installed.

    Returns:
        The output of the first successful decompression, or None.
    """
    results = []

    # Zlib: 15 = standard header, -15 = raw deflate, 31 = gzip wrapper.
    # NOTE: these were bare `except:` clauses, which would also swallow
    # KeyboardInterrupt/SystemExit; narrowed to Exception throughout.
    for wbits in [15, -15, 31]:
        try:
            dec = zlib.decompress(data, wbits)
            results.append(("zlib" + (f" wbits={wbits}" if wbits != 15 else ""), dec))
        except Exception:
            pass

    # LZ4 frame format (optional dependency)
    try:
        import lz4.frame
        dec = lz4.frame.decompress(data)
        results.append(("lz4.frame", dec))
    except Exception:
        pass

    # LZ4 raw block: the output size is not stored, so guess a few sizes
    try:
        import lz4.block
        for size in [1 << 20, 1 << 22, 1 << 24]:
            try:
                dec = lz4.block.decompress(data, uncompressed_size=size)
                results.append((f"lz4.block (uncompressed_size={size})", dec))
                break
            except Exception:
                pass
    except Exception:
        pass

    # Zstandard (optional dependency)
    try:
        import zstandard as zstd
        dctx = zstd.ZstdDecompressor()
        dec = dctx.decompress(data, max_output_size=len(data) * 10)
        results.append(("zstandard", dec))
    except Exception:
        pass

    if results:
        for method, dec in results:
            print(f" ✓ {label} Decompression SUCCESS with {method}: {len(dec)} bytes")
            print(f" First 64 bytes: {dec[:64].hex()}")
            print(f" Entropy: {entropy(dec[:4096]):.3f}, unique: {unique_byte_ratio(dec[:4096])}")
            headers = check_known_headers(dec)
            for h in headers:
                print(f" {h}")
        return results[0][1]
    return None
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def decrypt_aes_cfb(data: bytes, key: bytes, iv: bytes, segment_size: int = 8) -> Optional[bytes]:
    """Decrypt *data* with AES-CFB using whichever crypto backend is available.

    Args:
        data: Ciphertext bytes.
        key: AES key (16/24/32 bytes).
        iv: 16-byte initialization vector.
        segment_size: CFB feedback width in bits (8 for CFB8, 128 for CFB128).

    Returns:
        Plaintext bytes, or None when no available backend could perform the
        operation.
    """
    if HAS_PYCRYPTODOME:
        try:
            cipher = PyCryptoAES.new(key, PyCryptoAES.MODE_CFB, iv=iv, segment_size=segment_size)
            return cipher.decrypt(data)
        except Exception:
            # Fall through to the `cryptography` backend instead of giving up:
            # the original returned None here, masking a usable fallback.
            pass

    if HAS_CRYPTOGRAPHY:
        try:
            if segment_size == 128:
                cipher = Cipher(algorithms.AES(key), modes.CFB(iv), backend=default_backend())
            elif segment_size == 8:
                cipher = Cipher(algorithms.AES(key), modes.CFB8(iv), backend=default_backend())
            else:
                # `cryptography` only exposes CFB8 and CFB128 feedback widths.
                return None
            decryptor = cipher.decryptor()
            return decryptor.update(data) + decryptor.finalize()
        except Exception:
            return None

    return None
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def analyze_decrypted(data: bytes, label: str) -> bool:
    """Inspect a decryption candidate; print details and return True if promising."""
    if data is None:
        return False

    sample = data[:4096]
    ent = entropy(sample)
    unique = unique_byte_ratio(sample)
    headers = check_known_headers(data)

    # Heuristics for "this might actually be plaintext": lowered entropy, a
    # recognised header signature, or the expected magic_number == 1 prefix.
    looks_good = ent < 7.5 or bool(headers)
    if not looks_good:
        looks_good = (
            data[:4] == b"\x01\x00\x00\x00"    # magic_number = 1 LE
            or data[:4] == b"\x00\x00\x00\x01"  # magic_number = 1 BE
            or data[:2] == b"\x08\x01"          # protobuf magic_number = 1
        )

    if not looks_good:
        return False

    print(f" ★★★ PROMISING: {label}")
    print(f" Entropy: {ent:.3f}, Unique bytes: {unique}")
    print(" First 128 bytes:")
    print(hex_dump(data[:128]))
    for h in headers:
        print(f" {h}")

    # Try decompression on promising results
    try_decompress(data, label)

    # Also retry after skipping a possible small prefix (length/magic fields).
    for skip in [4, 8, 12, 16, 20]:
        if len(data) > skip + 10:
            try_decompress(data[skip:], f"{label} [skip {skip} bytes]")

    return True
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ═══════════════════════════════════════════════════════════════
|
| 242 |
+
# MAIN ANALYSIS
|
| 243 |
+
# ═══════════════════════════════════════════════════════════════
|
| 244 |
+
|
| 245 |
+
def main():
    """Run the full .onemodel analysis end to end.

    Sequence: read the file, parse the top-level structure, characterize the
    header/payload regions, analyze the key, then run a battery of AES
    decryption attempts (many key/IV/segment-size combinations, several
    structural hypotheses, and alternative cipher modes), printing every
    promising result and a final summary.
    """
    print("=" * 80)
    print("OneOCR .onemodel File Analysis & Decryption Attempt")
    print("=" * 80)

    # ── Step 1: Read file ──
    with open(MODEL_PATH, "rb") as f:
        full_data = f.read()

    filesize = len(full_data)
    print(f"\nFile size: {filesize:,} bytes ({filesize/1024/1024:.2f} MB)")

    # ── Step 2: Parse top-level structure ──
    print("\n" + "═" * 80)
    print("SECTION 1: FILE STRUCTURE ANALYSIS")
    print("═" * 80)

    header_offset = struct.unpack_from("<I", full_data, 0)[0]
    field_at_4 = struct.unpack_from("<I", full_data, 4)[0]
    print(f"\n [0-3] uint32_LE (header_offset/size): {header_offset} (0x{header_offset:08x})")
    print(f" [4-7] uint32_LE: {field_at_4} (0x{field_at_4:08x})")

    # Check if it's a uint64
    u64_at_0 = struct.unpack_from("<Q", full_data, 0)[0]
    print(f" [0-7] uint64_LE: {u64_at_0} (0x{u64_at_0:016x})")

    # Analyze the metadata at offset 22636
    print(f"\n At offset {header_offset} (0x{header_offset:04x}):")
    meta_magic_8 = full_data[header_offset:header_offset+8]
    meta_size = struct.unpack_from("<Q", full_data, header_offset + 8)[0]
    print(f" [+0..+7] 8 bytes: {meta_magic_8.hex()}")
    print(f" [+8..+15] uint64_LE: {meta_size:,} (0x{meta_size:016x})")
    encrypted_start = header_offset + 16
    encrypted_size = meta_size
    print(f" Encrypted payload: offset {encrypted_start} ({encrypted_start:#x}), size {encrypted_size:,}")
    # Sanity check: the recorded payload size should account for every byte
    # between the metadata block and the end of file.
    print(f" Check: {encrypted_start} + {encrypted_size} = {encrypted_start + encrypted_size} "
          f"vs filesize {filesize} → {'MATCH ✓' if encrypted_start + encrypted_size == filesize else 'MISMATCH ✗'}")

    # ── Step 3: Analyze header region ──
    print(f"\n Header region [8 .. {header_offset-1}]: {header_offset - 8} bytes")
    header_data = full_data[8:header_offset]
    print(f" Entropy: {entropy(header_data[:4096]):.3f}")
    print(f" Unique bytes (first 4KB): {unique_byte_ratio(header_data[:4096])}")
    print(f" Null bytes: {header_data.count(0)}/{len(header_data)}")

    # ── Step 4: Analyze encrypted payload region ──
    print(f"\n Encrypted payload [{encrypted_start} .. {filesize-1}]: {encrypted_size:,} bytes")
    payload_sample = full_data[encrypted_start:encrypted_start+4096]
    print(f" Entropy (first 4KB): {entropy(payload_sample):.3f}")
    print(f" Unique bytes (first 4KB): {unique_byte_ratio(payload_sample)}")

    # ── Step 5: Look for structure in metadata ──
    print(f"\n Detailed metadata dump at offset {header_offset}:")
    print(hex_dump(full_data[header_offset:header_offset+128], offset=header_offset))

    # Parse more fields from the metadata region
    print(f"\n Parsing fields after metadata header:")
    meta_region = full_data[header_offset:header_offset + 256]
    for i in range(0, 128, 4):
        u32 = struct.unpack_from("<I", meta_region, i)[0]
        if u32 > 0 and u32 < filesize:
            print(f" +{i:3d}: u32={u32:12,d} (0x{u32:08x})"
                  f" {'← could be offset/size' if 100 < u32 < filesize else ''}")

    # ── Step 6: Hash analysis of key ──
    print("\n" + "═" * 80)
    print("SECTION 2: KEY ANALYSIS")
    print("═" * 80)
    print(f"\n Raw key ({len(KEY_RAW)} bytes): {KEY_RAW}")
    print(f" Raw key hex: {KEY_RAW.hex()}")
    print(f" SHA256 of key: {KEY_SHA256.hex()}")

    # Check if SHA256 of key appears in the file header
    if KEY_SHA256 in full_data[:header_offset + 256]:
        idx = full_data.index(KEY_SHA256)
        print(f" ★ SHA256 of key FOUND in file at offset {idx}!")
    else:
        print(f" SHA256 of key not found in first {header_offset + 256} bytes")

    # Check if the 8-byte magic at offset 22636 could be related to key hash
    key_sha256_first8 = KEY_SHA256[:8]
    print(f" First 8 bytes of SHA256(key): {key_sha256_first8.hex()}")
    print(f" 8 bytes at offset {header_offset}: {meta_magic_8.hex()}")
    print(f" Match: {'YES ★' if key_sha256_first8 == meta_magic_8 else 'NO'}")

    # ── Step 7: Decryption attempts ──
    print("\n" + "═" * 80)
    print("SECTION 3: DECRYPTION ATTEMPTS")
    print("═" * 80)

    # Prepare IV candidates
    iv_zero = b"\x00" * 16
    iv_from_8 = full_data[8:24]
    iv_from_4 = full_data[4:20]
    iv_from_file_start = full_data[0:16]
    iv_from_meta = full_data[header_offset:header_offset + 16]
    iv_from_meta_8 = meta_magic_8 + b"\x00" * 8  # pad the 8-byte magic to 16

    # SHA256 of key, take first 16 bytes as IV
    iv_sha256_key_first16 = KEY_SHA256[:16]

    iv_candidates = {
        "all-zeros": iv_zero,
        "file[8:24]": iv_from_8,
        "file[4:20]": iv_from_4,
        "file[0:16]": iv_from_file_start,
        f"file[{header_offset}:{header_offset+16}]": iv_from_meta,
        "meta_magic+padding": iv_from_meta_8,
        "SHA256(key)[:16]": iv_sha256_key_first16,
    }

    # Key candidates
    key_candidates = {
        "RAW key (32 bytes)": KEY_RAW,
        "SHA256(RAW key)": KEY_SHA256,
    }

    # Data regions to try decrypting
    # We try both the header data and the start of the encrypted payload
    regions = {
        "header[8:22636]": full_data[8:min(8 + 4096, header_offset)],
        f"payload[{encrypted_start}:]": full_data[encrypted_start:encrypted_start + 4096],
    }

    # Also try: what if the entire region from byte 8 to end is one encrypted blob?
    regions["all_encrypted[8:]"] = full_data[8:8 + 4096]

    # Segment sizes: Windows BCrypt CFB defaults to 8-bit (CFB8), also try 128-bit (CFB128)
    segment_sizes = [8, 128]

    total_attempts = 0
    promising_results = []

    # Exhaustive cartesian product: every key × IV × feedback width × region.
    for key_name, key in key_candidates.items():
        for iv_name, iv in iv_candidates.items():
            for seg_size in segment_sizes:
                for region_name, region_data in regions.items():
                    total_attempts += 1
                    label = f"key={key_name}, iv={iv_name}, CFB{seg_size}, region={region_name}"

                    decrypted = decrypt_aes_cfb(region_data, key, iv, seg_size)
                    if decrypted and analyze_decrypted(decrypted, label):
                        promising_results.append(label)

    print(f"\n Total attempts: {total_attempts}")
    print(f" Promising results: {len(promising_results)}")

    # ── Step 8: Additional IV strategies ──
    print("\n" + "═" * 80)
    print("SECTION 4: ADVANCED IV STRATEGIES")
    print("═" * 80)

    # Strategy: IV might be derived from the file content
    # Try every 16-byte aligned position in the first 256 bytes as IV
    print("\n Trying every 16-byte aligned offset in first 256 bytes as IV...")

    for iv_offset in range(0, 256, 4):  # try every 4-byte step
        iv_cand = full_data[iv_offset:iv_offset + 16]
        if len(iv_cand) < 16:
            continue

        for key in [KEY_RAW, KEY_SHA256]:
            for seg in [8, 128]:
                # Try decrypting the payload
                payload_start = encrypted_start
                test_data = full_data[payload_start:payload_start + 4096]
                decrypted = decrypt_aes_cfb(test_data, key, iv_cand, seg)
                if decrypted:
                    is_good = analyze_decrypted(decrypted,
                        f"iv_offset={iv_offset}, key={'raw' if key == KEY_RAW else 'sha256'}, CFB{seg}, payload")
                    if is_good:
                        promising_results.append(f"Advanced: iv_offset={iv_offset}")

                # Try decrypting from byte 8 (header encrypted area)
                test_data2 = full_data[8:8 + 4096]
                decrypted2 = decrypt_aes_cfb(test_data2, key, iv_cand, seg)
                if decrypted2:
                    is_good = analyze_decrypted(decrypted2,
                        f"iv_offset={iv_offset}, key={'raw' if key == KEY_RAW else 'sha256'}, CFB{seg}, header[8:]")
                    if is_good:
                        promising_results.append(f"Advanced: iv_offset={iv_offset} header")

    # ── Step 9: Try with IV = SHA256 of various things ──
    print("\n" + "═" * 80)
    print("SECTION 5: DERIVED IV STRATEGIES")
    print("═" * 80)

    derived_ivs = {
        "SHA256(key)[:16]": hashlib.sha256(KEY_RAW).digest()[:16],
        "SHA256(key)[16:]": hashlib.sha256(KEY_RAW).digest()[16:],
        "SHA256('')[:16]": hashlib.sha256(b"").digest()[:16],
        "MD5(key)": hashlib.md5(KEY_RAW).digest(),
        "SHA256(file[0:8])[:16]": hashlib.sha256(full_data[0:8]).digest()[:16],
        "SHA256(file[0:4])[:16]": hashlib.sha256(full_data[0:4]).digest()[:16],
        "SHA256('oneocr')[:16]": hashlib.sha256(b"oneocr").digest()[:16],
        "SHA256('oneocr.onemodel')[:16]": hashlib.sha256(b"oneocr.onemodel").digest()[:16],
    }

    for iv_name, iv in derived_ivs.items():
        for key_name, key in key_candidates.items():
            for seg in [8, 128]:
                for region_name, region_data in regions.items():
                    label = f"key={key_name}, iv={iv_name}, CFB{seg}, region={region_name}"
                    decrypted = decrypt_aes_cfb(region_data, key, iv, seg)
                    if decrypted and analyze_decrypted(decrypted, label):
                        promising_results.append(label)

    # ── Step 10: What if the structure is different? ──
    print("\n" + "═" * 80)
    print("SECTION 6: ALTERNATIVE STRUCTURE HYPOTHESES")
    print("═" * 80)

    # Hypothesis A: Bytes 0-3 = offset, 4-7 = 0, 8-23 = IV, 24+ = encrypted data
    print("\n Hypothesis A: [0-3]=offset, [4-7]=flags, [8-23]=IV, [24+]=encrypted")
    iv_hyp_a = full_data[8:24]
    encrypted_hyp_a = full_data[24:24 + 4096]
    for key_name, key in key_candidates.items():
        for seg in [8, 128]:
            dec = decrypt_aes_cfb(encrypted_hyp_a, key, iv_hyp_a, seg)
            if dec:
                analyze_decrypted(dec, f"HypA: key={key_name}, CFB{seg}")

    # Hypothesis B: [0-7]=header, [8-23]=IV, [24-22635]=encrypted meta, then payload also encrypted
    print("\n Hypothesis B: [0-7]=header, [22636-22651]=16-byte meta, payload starts at 22652")
    print(f" If meta[22636:22652] contains IV for payload:")
    iv_hyp_b = full_data[header_offset:header_offset + 16]
    enc_payload = full_data[encrypted_start:encrypted_start + 4096]
    for key_name, key in key_candidates.items():
        for seg in [8, 128]:
            dec = decrypt_aes_cfb(enc_payload, key, iv_hyp_b, seg)
            if dec:
                analyze_decrypted(dec, f"HypB: key={key_name}, CFB{seg}, payload with meta-IV")

    # Hypothesis C: The entire file from byte 8 to end is one encrypted stream (IV = zeros)
    print("\n Hypothesis C: Single encrypted stream from byte 8, IV=zeros")
    single_stream = full_data[8:8 + 4096]
    for key_name, key in key_candidates.items():
        for seg in [8, 128]:
            dec = decrypt_aes_cfb(single_stream, key, iv_zero, seg)
            if dec:
                analyze_decrypted(dec, f"HypC: key={key_name}, CFB{seg}")

    # Hypothesis D: Encrypted data starts right at byte 0 (the header_size field IS part of encrypted data)
    # This would mean the header_size value 22636 is coincidental
    print("\n Hypothesis D: Encrypted from byte 0, IV=zeros")
    for key_name, key in key_candidates.items():
        for seg in [8, 128]:
            dec = decrypt_aes_cfb(full_data[:4096], key, iv_zero, seg)
            if dec:
                analyze_decrypted(dec, f"HypD: key={key_name}, CFB{seg}, from byte 0")

    # Hypothesis E: Windows CNG might prepend IV to ciphertext
    # So bytes 0-3 = header_size, 4-7 = 0, 8-23 = IV (embedded in encrypted blob), 24+ = ciphertext
    print("\n Hypothesis E: IV prepended to ciphertext at various offsets")
    for data_start in [0, 4, 8]:
        iv_e = full_data[data_start:data_start + 16]
        ct_e = full_data[data_start + 16:data_start + 16 + 4096]
        for key_name, key in key_candidates.items():
            for seg in [8, 128]:
                dec = decrypt_aes_cfb(ct_e, key, iv_e, seg)
                if dec:
                    analyze_decrypted(dec, f"HypE: data_start={data_start}, key={key_name}, CFB{seg}")

    # ── Step 11: Try OFB and CTR modes too (just in case CFB was misidentified) ──
    print("\n" + "═" * 80)
    print("SECTION 7: ALTERNATIVE CIPHER MODES (OFB, CBC)")
    print("═" * 80)

    if HAS_PYCRYPTODOME:
        for data_start in [8, 24, encrypted_start]:
            for iv_offset in [0, 4, 8]:
                iv_alt = full_data[iv_offset:iv_offset + 16]
                test_data = full_data[data_start:data_start + 4096]
                for key in [KEY_RAW, KEY_SHA256]:
                    key_label = "raw" if key == KEY_RAW else "sha256"

                    # OFB
                    try:
                        cipher = PyCryptoAES.new(key, PyCryptoAES.MODE_OFB, iv=iv_alt)
                        dec = cipher.decrypt(test_data)
                        analyze_decrypted(dec, f"OFB: data@{data_start}, iv@{iv_offset}, key={key_label}")
                    except:
                        pass

                    # CBC (needs padding but try anyway)
                    try:
                        cipher = PyCryptoAES.new(key, PyCryptoAES.MODE_CBC, iv=iv_alt)
                        dec = cipher.decrypt(test_data)
                        analyze_decrypted(dec, f"CBC: data@{data_start}, iv@{iv_offset}, key={key_label}")
                    except:
                        pass

                    # ECB (no IV)
                    try:
                        cipher = PyCryptoAES.new(key, PyCryptoAES.MODE_ECB)
                        # ECB needs data aligned to 16 bytes
                        aligned = test_data[:len(test_data) - (len(test_data) % 16)]
                        dec = cipher.decrypt(aligned)
                        analyze_decrypted(dec, f"ECB: data@{data_start}, key={key_label}")
                    except:
                        pass

    # ── Step 12: Summary ──
    print("\n" + "═" * 80)
    print("SUMMARY")
    print("═" * 80)

    print(f"\n File structure (confirmed):")
    print(f" [0x0000 - 0x0007] 8-byte header: offset = {header_offset}")
    print(f" [0x0008 - 0x{header_offset-1:04x}] Encrypted header data ({header_offset - 8} bytes)")
    print(f" [0x{header_offset:04x} - 0x{header_offset+7:04x}] 8-byte magic/hash: {meta_magic_8.hex()}")
    print(f" [0x{header_offset+8:04x} - 0x{header_offset+15:04x}] uint64 payload size: {meta_size:,}")
    print(f" [0x{encrypted_start:04x} - 0x{filesize-1:07x}] Encrypted payload ({encrypted_size:,} bytes)")

    print(f"\n Key info:")
    print(f" Raw key: {KEY_RAW}")
    print(f" Raw key hex: {KEY_RAW.hex()}")
    print(f" SHA256(key): {KEY_SHA256.hex()}")

    print(f"\n Total promising decryption results: {len(promising_results)}")
    for r in promising_results:
        print(f" ★ {r}")

    if not promising_results:
        print("\n No successful decryption found with standard approaches.")
        print(" Possible reasons:")
        print(" 1. The key might be processed differently (PBKDF2, HKDF, etc.)")
        print(" 2. The IV might be derived in a non-standard way")
        print(" 3. The file structure might be more complex")
        print(" 4. The CBC/CFB segment size might be non-standard")
        print(" 5. There might be additional authentication (AEAD)")
        print(" 6. The BCrypt CNG API might use specific key blob format")
        print(" 7. Think about BCRYPT_KEY_DATA_BLOB_HEADER structure")
|
| 579 |
+
|
| 580 |
+
# Script entry point: run the full analysis when executed directly.
if __name__ == "__main__":
    main()
|
_archive/analysis/analyze_dx.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze DX index structure to understand chunk record format."""
|
| 2 |
+
import hashlib
|
| 3 |
+
import struct
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from Crypto.Cipher import AES
|
| 7 |
+
|
| 8 |
+
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 9 |
+
IV = b"Copyright @ OneO"
|
| 10 |
+
|
| 11 |
+
file_data = Path("ocr_data/oneocr.onemodel").read_bytes()
|
| 12 |
+
|
| 13 |
+
# Step 1: Decrypt DX
|
| 14 |
+
header_hash = file_data[8:24]
|
| 15 |
+
dx_key = hashlib.sha256(KEY + header_hash).digest()
|
| 16 |
+
encrypted_dx = file_data[24:24 + 22624]
|
| 17 |
+
cipher = AES.new(dx_key, AES.MODE_CFB, iv=IV, segment_size=128)
|
| 18 |
+
dx = cipher.decrypt(encrypted_dx)
|
| 19 |
+
assert dx[:2] == b"DX"
|
| 20 |
+
|
| 21 |
+
# Load crypto log
|
| 22 |
+
crypto_log = json.load(open("temp/crypto_log.json"))
|
| 23 |
+
|
| 24 |
+
# Get unique SHA256 inputs in order
|
| 25 |
+
sha_ops = [x for x in crypto_log if x['op'] == 'sha256']
|
| 26 |
+
seen = set()
|
| 27 |
+
unique_sha = []
|
| 28 |
+
for s in sha_ops:
|
| 29 |
+
if s['input'] not in seen:
|
| 30 |
+
seen.add(s['input'])
|
| 31 |
+
unique_sha.append(s)
|
| 32 |
+
|
| 33 |
+
# Get decrypt ops
|
| 34 |
+
dec_ops = [x for x in crypto_log if x['op'] == 'decrypt']
|
| 35 |
+
|
| 36 |
+
# For each SHA256 input, find its position in DX
|
| 37 |
+
print("=" * 80)
|
| 38 |
+
print("DX Index Structure Analysis")
|
| 39 |
+
print("=" * 80)
|
| 40 |
+
print(f"DX size: {len(dx)} bytes, valid: {struct.unpack('<Q', dx[8:16])[0]}")
|
| 41 |
+
print()
|
| 42 |
+
|
| 43 |
+
# Skip first SHA256 (DX key derivation uses master_key + file_header, not DX data)
|
| 44 |
+
print("SHA256 input #0: DX key = SHA256(master_key + file[8:24]) [special case]")
|
| 45 |
+
print()
|
| 46 |
+
|
| 47 |
+
for i, s in enumerate(unique_sha[1:], 1):
|
| 48 |
+
inp = bytes.fromhex(s['input'])
|
| 49 |
+
pos = dx.find(inp)
|
| 50 |
+
|
| 51 |
+
# Also try finding parts of the input
|
| 52 |
+
first_uint64 = inp[:8]
|
| 53 |
+
pos_partial = dx.find(first_uint64)
|
| 54 |
+
|
| 55 |
+
if pos >= 0:
|
| 56 |
+
print(f"SHA256 #{i:3d}: len={s['input_len']:2d} found at DX offset {pos:5d} (0x{pos:04x})")
|
| 57 |
+
elif pos_partial >= 0:
|
| 58 |
+
# The input might be rearranged from DX
|
| 59 |
+
size1 = struct.unpack('<Q', inp[:8])[0]
|
| 60 |
+
size2 = struct.unpack('<Q', inp[8:16])[0]
|
| 61 |
+
checksum = inp[16:] if len(inp) > 16 else b""
|
| 62 |
+
|
| 63 |
+
# Check if sizes and checksum are nearby but in different order
|
| 64 |
+
pos_sizes = dx.find(inp[:16])
|
| 65 |
+
pos_check = dx.find(checksum) if checksum else -1
|
| 66 |
+
|
| 67 |
+
if pos_sizes >= 0:
|
| 68 |
+
print(f"SHA256 #{i:3d}: len={s['input_len']:2d} sizes at DX offset {pos_sizes:5d}, checksum at {pos_check}")
|
| 69 |
+
else:
|
| 70 |
+
# Sizes might be in different order or interleaved
|
| 71 |
+
pos_s1 = dx.find(first_uint64)
|
| 72 |
+
print(f"SHA256 #{i:3d}: len={s['input_len']:2d} first_uint64 at DX offset {pos_s1:5d} (rearranged?)")
|
| 73 |
+
|
| 74 |
+
size1 = struct.unpack('<Q', inp[:8])[0]
|
| 75 |
+
size2 = struct.unpack('<Q', inp[8:16])[0]
|
| 76 |
+
print(f" size1={size1} size2={size2} diff={size2-size1}")
|
| 77 |
+
else:
|
| 78 |
+
size1 = struct.unpack('<Q', inp[:8])[0]
|
| 79 |
+
size2 = struct.unpack('<Q', inp[8:16])[0]
|
| 80 |
+
print(f"SHA256 #{i:3d}: len={s['input_len']:2d} NOT FOUND (size1={size1} size2={size2})")
|
| 81 |
+
|
| 82 |
+
# Now let's dump DX structure around the first few records
|
| 83 |
+
print()
|
| 84 |
+
print("=" * 80)
|
| 85 |
+
print("DX Record Structure (first 128 bytes)")
|
| 86 |
+
print("=" * 80)
|
| 87 |
+
|
| 88 |
+
off = 0
|
| 89 |
+
print(f"[{off:4d}] DX Magic: {dx[off:off+8]!r}")
|
| 90 |
+
off += 8
|
| 91 |
+
print(f"[{off:4d}] Valid Size: {struct.unpack('<Q', dx[off:off+8])[0]}")
|
| 92 |
+
off += 8
|
| 93 |
+
print(f"[{off:4d}] Container: {dx[off:off+8].hex()}")
|
| 94 |
+
off += 8
|
| 95 |
+
val = struct.unpack('<Q', dx[off:off+8])[0]
|
| 96 |
+
print(f"[{off:4d}] Value: {val} (0x{val:x})")
|
| 97 |
+
off += 8
|
| 98 |
+
print(f"[{off:4d}] Checksum: {dx[off:off+16].hex()}")
|
| 99 |
+
off += 16
|
| 100 |
+
s1 = struct.unpack('<Q', dx[off:off+8])[0]
|
| 101 |
+
s2 = struct.unpack('<Q', dx[off+8:off+16])[0]
|
| 102 |
+
print(f"[{off:4d}] Sizes: {s1}, {s2} (diff={s2-s1})")
|
| 103 |
+
off += 16
|
| 104 |
+
|
| 105 |
+
print(f"[{off:4d}] Enc data starts: {dx[off:off+32].hex()}")
|
| 106 |
+
|
| 107 |
+
# The config chunk data is here, 11920 bytes
|
| 108 |
+
config_enc_size = 11920
|
| 109 |
+
config_end = off + config_enc_size
|
| 110 |
+
print(f" Config encrypted data: offset {off} to {config_end} ({config_enc_size} bytes)")
|
| 111 |
+
|
| 112 |
+
# What's after the config?
|
| 113 |
+
print(f"\n--- After config chunk ({config_end}) ---")
|
| 114 |
+
for j in range(0, 80, 16):
|
| 115 |
+
pos = config_end + j
|
| 116 |
+
if pos + 16 > len(dx):
|
| 117 |
+
break
|
| 118 |
+
chunk = dx[pos:pos+16]
|
| 119 |
+
hex_str = ' '.join(f'{b:02x}' for b in chunk)
|
| 120 |
+
ascii_str = ''.join(chr(b) if 32 <= b < 127 else '.' for b in chunk)
|
| 121 |
+
print(f" {pos:5d} ({pos:#06x}): {hex_str:<48s} {ascii_str}")
|
| 122 |
+
|
| 123 |
+
# Look at the area around found patterns
|
| 124 |
+
for name, dx_off in [("Chunk2(encrypt) 0x2ed7", 0x2ed7),
|
| 125 |
+
("Chunk4(ONNX) 0x2f80", 0x2f80),
|
| 126 |
+
("Chunk5(ONNX2) 0x4692", 0x4692)]:
|
| 127 |
+
print(f"\n--- Area around {name} ---")
|
| 128 |
+
start = max(0, dx_off - 48)
|
| 129 |
+
for j in range(0, 128, 16):
|
| 130 |
+
pos = start + j
|
| 131 |
+
if pos + 16 > len(dx):
|
| 132 |
+
break
|
| 133 |
+
chunk = dx[pos:pos+16]
|
| 134 |
+
hex_str = ' '.join(f'{b:02x}' for b in chunk)
|
| 135 |
+
ascii_str = ''.join(chr(b) if 32 <= b < 127 else '.' for b in chunk)
|
| 136 |
+
marker = " <<<" if pos == dx_off else ""
|
| 137 |
+
print(f" {pos:5d} ({pos:#06x}): {hex_str:<48s} {ascii_str}{marker}")
|
_archive/analysis/analyze_extracted.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manually parse protobuf structure of extracted files."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
EXTRACT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\extracted_models")
|
| 5 |
+
|
| 6 |
+
def read_varint(data, pos):
    """Decode a protobuf base-128 varint starting at *pos*.

    Returns (value, next_pos), where next_pos points just past the
    last byte consumed. An empty/exhausted buffer yields (0, pos).
    """
    value, shift = 0, 0
    while pos < len(data):
        byte = data[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        # High bit clear means this was the final byte of the varint.
        if byte & 0x80 == 0:
            break
        shift += 7
    return value, pos
|
| 17 |
+
|
| 18 |
+
def parse_protobuf_fields(data, max_fields=10):
    """Walk protobuf wire format and collect field descriptors.

    Returns a list of (field_num, type_tag, value, extra) tuples:
      - varint fields:          (n, 'varint', value, None)
      - length-delimited:       (n, 'len-delim', length, preview-bytes)
                                or (n, 'len-delim', length, 'OVERFLOW')
      - fixed 64-bit / 32-bit:  (n, '64bit'/'32bit', little-endian int, None)
      - anything else:          (n, 'wireN', 0, 'UNKNOWN') and parsing stops
    At most *max_fields* fields are decoded.
    """
    fields = []
    pos = 0
    for _ in range(max_fields):
        if pos >= len(data):
            break
        tag = data[pos]
        pos += 1
        num, wire = tag >> 3, tag & 0x07

        if wire == 0:  # varint
            value, pos = read_varint(data, pos)
            fields.append((num, 'varint', value, None))
        elif wire == 2:  # length-delimited
            size, pos = read_varint(data, pos)
            # Declared length running past the buffer means corrupt input.
            if size > len(data) - pos or size < 0:
                fields.append((num, 'len-delim', size, 'OVERFLOW'))
                break
            # Keep at most 100 bytes as a preview of the payload.
            fields.append((num, 'len-delim', size, data[pos:pos + min(size, 100)]))
            pos += size
        elif wire == 1:  # fixed 64-bit
            fields.append((num, '64bit', int.from_bytes(data[pos:pos + 8], 'little'), None))
            pos += 8
        elif wire == 5:  # fixed 32-bit
            fields.append((num, '32bit', int.from_bytes(data[pos:pos + 4], 'little'), None))
            pos += 4
        else:
            # Unrecognized wire type: record it and stop scanning.
            fields.append((num, f'wire{wire}', 0, 'UNKNOWN'))
            break
    return fields
|
| 53 |
+
|
| 54 |
+
# Check top 10 largest heap files
|
| 55 |
+
files = sorted(
|
| 56 |
+
[f for f in EXTRACT_DIR.glob("*.bin") if "0x271a" in f.name],
|
| 57 |
+
key=lambda f: f.stat().st_size,
|
| 58 |
+
reverse=True
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
print("=" * 70)
|
| 62 |
+
print("PROTOBUF STRUCTURE ANALYSIS of largest heap files")
|
| 63 |
+
print("=" * 70)
|
| 64 |
+
|
| 65 |
+
for f in files[:10]:
|
| 66 |
+
data = open(f, 'rb').read(2048)
|
| 67 |
+
size = f.stat().st_size
|
| 68 |
+
print(f"\n{f.name} ({size//1024}KB):")
|
| 69 |
+
print(f" First 32 bytes: {data[:32].hex()}")
|
| 70 |
+
|
| 71 |
+
fields = parse_protobuf_fields(data)
|
| 72 |
+
for fn, wt, val, preview in fields:
|
| 73 |
+
if wt == 'varint':
|
| 74 |
+
print(f" field={fn} {wt} value={val}")
|
| 75 |
+
elif wt == 'len-delim':
|
| 76 |
+
if preview == 'OVERFLOW':
|
| 77 |
+
print(f" field={fn} {wt} length={val} OVERFLOW!")
|
| 78 |
+
elif val < 200 and preview:
|
| 79 |
+
try:
|
| 80 |
+
txt = preview.decode('utf-8', errors='replace')
|
| 81 |
+
printable = all(c.isprintable() or c in '\n\r\t' for c in txt[:50])
|
| 82 |
+
if printable and len(txt) > 0:
|
| 83 |
+
print(f" field={fn} {wt} length={val} text='{txt[:80]}'")
|
| 84 |
+
else:
|
| 85 |
+
print(f" field={fn} {wt} length={val} hex={preview[:40].hex()}")
|
| 86 |
+
except:
|
| 87 |
+
print(f" field={fn} {wt} length={val} hex={preview[:40].hex()}")
|
| 88 |
+
else:
|
| 89 |
+
if preview:
|
| 90 |
+
print(f" field={fn} {wt} length={val} first_bytes={preview[:20].hex()}")
|
| 91 |
+
else:
|
| 92 |
+
print(f" field={fn} {wt} length={val}")
|
| 93 |
+
else:
|
| 94 |
+
print(f" field={fn} {wt} value={val}")
|
| 95 |
+
|
| 96 |
+
# Also check a mid-sized file that might be a complete model
|
| 97 |
+
print("\n" + "=" * 70)
|
| 98 |
+
print("CHECKING MID-SIZED FILES (100KB - 2MB range)")
|
| 99 |
+
print("=" * 70)
|
| 100 |
+
|
| 101 |
+
mid_files = sorted(
|
| 102 |
+
[f for f in EXTRACT_DIR.glob("*.bin")
|
| 103 |
+
if "0x271a" in f.name and 100*1024 < f.stat().st_size < 2*1024*1024],
|
| 104 |
+
key=lambda f: f.stat().st_size,
|
| 105 |
+
reverse=True
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
import onnx
|
| 109 |
+
valid_count = 0
|
| 110 |
+
for f in mid_files[:100]:
|
| 111 |
+
try:
|
| 112 |
+
m = onnx.load(str(f))
|
| 113 |
+
valid_count += 1
|
| 114 |
+
print(f" VALID: {f.name} ({f.stat().st_size//1024}KB)")
|
| 115 |
+
print(f" ir={m.ir_version} producer='{m.producer_name}' "
|
| 116 |
+
f"graph='{m.graph.name}' nodes={len(m.graph.node)}")
|
| 117 |
+
except:
|
| 118 |
+
pass
|
| 119 |
+
|
| 120 |
+
if valid_count == 0:
|
| 121 |
+
print(" No valid ONNX models in mid-range files either.")
|
| 122 |
+
|
| 123 |
+
# Check if the largest files might be a container/archive
|
| 124 |
+
print("\n" + "=" * 70)
|
| 125 |
+
print("CHECKING FOR INTERNAL ONNX BOUNDARIES IN LARGEST FILE")
|
| 126 |
+
print("=" * 70)
|
| 127 |
+
|
| 128 |
+
biggest = files[0]
|
| 129 |
+
data = open(biggest, 'rb').read()
|
| 130 |
+
print(f"File: {biggest.name}, total size: {len(data)} bytes")
|
| 131 |
+
|
| 132 |
+
# Search for all occurrences of valid ONNX-like starts
|
| 133 |
+
import re
|
| 134 |
+
# Look for 0x08 [3-9] 0x12 pattern (ir_version + field2)
|
| 135 |
+
pattern = re.compile(b'\\x08[\\x03-\\x09]\\x12')
|
| 136 |
+
matches = [(m.start(), data[m.start()+1]) for m in pattern.finditer(data[:1000])]
|
| 137 |
+
print(f"ONNX-like headers in first 1000 bytes: {len(matches)}")
|
| 138 |
+
for offset, ir in matches[:10]:
|
| 139 |
+
print(f" offset={offset}: ir_version={ir}")
|
| 140 |
+
|
| 141 |
+
# Also search for "ONNX" string, "onnx" string, "graph" string
|
| 142 |
+
for needle in [b'ONNX', b'onnx', b'graph', b'Conv', b'Relu', b'BatchNorm', b'MatMul']:
|
| 143 |
+
positions = [m.start() for m in re.finditer(re.escape(needle), data[:50000])]
|
| 144 |
+
if positions:
|
| 145 |
+
print(f" Found '{needle.decode()}' at offsets: {positions[:5]}")
|
_archive/analysis/analyze_model.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze oneocr.onemodel file format."""
|
| 2 |
+
import os
|
| 3 |
+
import struct
|
| 4 |
+
|
| 5 |
+
MODEL_PATH = r"ocr_data\oneocr.onemodel"
|
| 6 |
+
|
| 7 |
+
with open(MODEL_PATH, "rb") as f:
|
| 8 |
+
data = f.read()
|
| 9 |
+
|
| 10 |
+
print(f"Total size: {len(data)} bytes = {len(data)/1024/1024:.2f} MB")
|
| 11 |
+
print(f"First 8 bytes (hex): {data[:8].hex()}")
|
| 12 |
+
print(f"First 4 bytes as uint32 LE: {struct.unpack('<I', data[:4])[0]}")
|
| 13 |
+
print(f"First 8 bytes as uint64 LE: {struct.unpack('<Q', data[:8])[0]}")
|
| 14 |
+
print()
|
| 15 |
+
|
| 16 |
+
# Search for known patterns
|
| 17 |
+
patterns = [b"onnx", b"ai.onnx", b"ONNX", b"ort_", b"onnxruntime",
|
| 18 |
+
b"ir_version", b"ORTM", b"FORT", b"ORT ", b"model",
|
| 19 |
+
b"graph", b"Conv", b"Relu", b"Softmax", b"tensor",
|
| 20 |
+
b"float", b"int64", b"opset", b"producer"]
|
| 21 |
+
|
| 22 |
+
for pattern in patterns:
|
| 23 |
+
idx = data.find(pattern)
|
| 24 |
+
if idx >= 0:
|
| 25 |
+
ctx_start = max(0, idx - 8)
|
| 26 |
+
ctx_end = min(len(data), idx + len(pattern) + 8)
|
| 27 |
+
print(f"Found '{pattern.decode(errors='replace')}' at offset {idx} (0x{idx:x})")
|
| 28 |
+
print(f" Context hex: {data[ctx_start:ctx_end].hex()}")
|
| 29 |
+
|
| 30 |
+
print()
|
| 31 |
+
|
| 32 |
+
# Check entropy by sections
|
| 33 |
+
import collections
|
| 34 |
+
def entropy_score(chunk):
    """Return the number of distinct byte values in *chunk* (0-256).

    A crude entropy proxy: values near 256 suggest encrypted/compressed
    data, low values suggest structured plaintext.
    """
    return len(collections.Counter(chunk))
|
| 38 |
+
|
| 39 |
+
print("Entropy analysis (unique byte values per 4KB block):")
|
| 40 |
+
for i in range(0, min(len(data), 64*1024), 4096):
|
| 41 |
+
chunk = data[i:i+4096]
|
| 42 |
+
e = entropy_score(chunk)
|
| 43 |
+
print(f" Offset 0x{i:06x}: {e}/256 unique bytes",
|
| 44 |
+
"(encrypted/compressed)" if e > 240 else "(structured)" if e < 100 else "")
|
| 45 |
+
|
| 46 |
+
# Look at first int as possible header size
|
| 47 |
+
hdr_size = struct.unpack('<I', data[:4])[0]
|
| 48 |
+
print(f"\nFirst uint32 = {hdr_size} (0x{hdr_size:x})")
|
| 49 |
+
print(f"If header size, data starts at offset {hdr_size}")
|
| 50 |
+
if hdr_size < len(data):
|
| 51 |
+
print(f"Data at offset {hdr_size}: {data[hdr_size:hdr_size+32].hex()}")
|
| 52 |
+
|
| 53 |
+
# Check what's at byte 8
|
| 54 |
+
print(f"\nBytes 8-16: {data[8:16].hex()}")
|
| 55 |
+
print(f"If offset 8 is data: unique bytes = {entropy_score(data[8:8+4096])}/256")
|
| 56 |
+
|
| 57 |
+
# XOR analysis - try single byte XOR keys
|
| 58 |
+
print("\nXOR key analysis (checking if XOR of first bytes gives ONNX protobuf header):")
|
| 59 |
+
# ONNX protobuf starts with 0x08 (varint, field 1 = ir_version)
|
| 60 |
+
xor_key_byte0 = data[0] ^ 0x08
|
| 61 |
+
print(f" If first byte should be 0x08: XOR key = 0x{xor_key_byte0:02x}")
|
| 62 |
+
# Try XOR with that key on first 16 bytes
|
| 63 |
+
test = bytes(b ^ xor_key_byte0 for b in data[:16])
|
| 64 |
+
print(f" XOR'd first 16 bytes: {test.hex()}")
|
_archive/analysis/decrypt_config.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Decrypt the config chunk from DX and analyze its protobuf structure.
|
| 2 |
+
Config = first encrypted payload inside DX index.
|
| 3 |
+
"""
|
| 4 |
+
import struct
|
| 5 |
+
import hashlib
|
| 6 |
+
from Crypto.Cipher import AES
|
| 7 |
+
|
| 8 |
+
MASTER_KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 9 |
+
IV = b"Copyright @ OneO"
|
| 10 |
+
|
| 11 |
+
def aes_cfb128_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Decrypt *data* with AES-CFB using full 128-bit feedback segments.

    segment_size=128 matches Windows BCrypt's CFB behavior; PyCryptodome
    defaults to 8-bit segments, so it must be set explicitly.
    """
    return AES.new(key, AES.MODE_CFB, iv=iv, segment_size=128).decrypt(data)
|
| 14 |
+
|
| 15 |
+
def decode_varint(data: bytes, offset: int) -> tuple[int, int]:
    """Read a protobuf base-128 varint beginning at *offset*.

    Returns (decoded_value, offset_just_past_the_varint); reading at or
    beyond the end of *data* yields (0, offset) unchanged.
    """
    value = 0
    bits = 0
    pos = offset
    while pos < len(data):
        byte = data[pos]
        pos += 1
        value |= (byte & 0x7F) << bits
        if byte < 0x80:  # continuation bit clear -> last byte
            break
        bits += 7
    return value, pos
|
| 27 |
+
|
| 28 |
+
def decode_protobuf_fields(data: bytes, indent: int = 0, max_depth: int = 3, prefix: str = ""):
    """Recursively decode protobuf-like structure.

    Prints a human-readable dump of each field to stdout; returns nothing.

    Args:
        data: raw protobuf-encoded bytes.
        indent: current recursion depth, used only for output padding.
        max_depth: maximum nesting depth at which binary payloads are
            retried as sub-messages.
        prefix: dotted field path carried through recursion (threaded but
            not printed anywhere in this function).
    """
    off = 0
    field_idx = 0
    pad = " " * indent
    # Hard cap of 200 fields guards against runaway parsing of garbage data.
    while off < len(data) and field_idx < 200:
        if off >= len(data):
            break
        tag_byte = data[off]
        field_num = tag_byte >> 3
        wire_type = tag_byte & 0x07

        # Field 0 is invalid protobuf; >30 is treated as a heuristic signal
        # that we are no longer looking at a real message.
        if field_num == 0 or field_num > 30:
            break

        off += 1

        if wire_type == 0:  # varint
            val, off = decode_varint(data, off)
            print(f"{pad}field {field_num} (varint): {val}")
        elif wire_type == 2:  # length-delimited
            length, off = decode_varint(data, off)
            if off + length > len(data):
                print(f"{pad}field {field_num} (bytes, len={length}): TRUNCATED at off={off}")
                break
            payload = data[off:off+length]
            # Try to decode as string
            try:
                s = payload.decode('utf-8')
                if all(c.isprintable() or c in '\n\r\t' for c in s):
                    if len(s) > 100:
                        print(f"{pad}field {field_num} (string, len={length}): {s[:100]}...")
                    else:
                        print(f"{pad}field {field_num} (string, len={length}): {s}")
                else:
                    # Decodes as UTF-8 but is not printable text: fall
                    # through to the binary/sub-message handling below.
                    raise ValueError()
            except (UnicodeDecodeError, ValueError):
                if indent < max_depth and length > 2 and length < 100000:
                    # Try parsing as sub-message
                    print(f"{pad}field {field_num} (msg, len={length}):")
                    decode_protobuf_fields(payload, indent + 1, max_depth, prefix=f"{prefix}f{field_num}.")
                else:
                    print(f"{pad}field {field_num} (bytes, len={length}): {payload[:32].hex()}...")
            off += length
        elif wire_type == 5:  # 32-bit
            if off + 4 > len(data):
                break
            val = struct.unpack_from("<I", data, off)[0]
            off += 4
            # Try float interpretation
            # (off was already advanced, hence the off-4 re-read below)
            fval = struct.unpack_from("<f", data, off-4)[0]
            print(f"{pad}field {field_num} (fixed32): {val} (0x{val:08x}, float={fval:.4f})")
        elif wire_type == 1:  # 64-bit
            if off + 8 > len(data):
                break
            val = struct.unpack_from("<Q", data, off)[0]
            off += 8
            print(f"{pad}field {field_num} (fixed64): {val}")
        else:
            # Wire types 3/4 (groups) and anything else are not handled.
            print(f"{pad}field {field_num} (wire={wire_type}): unknown, stopping")
            break
        field_idx += 1
|
| 90 |
+
|
| 91 |
+
# Read file
|
| 92 |
+
with open("ocr_data/oneocr.onemodel", "rb") as f:
|
| 93 |
+
fdata = f.read()
|
| 94 |
+
|
| 95 |
+
# Step 1: Decrypt DX
|
| 96 |
+
file_header_hash = fdata[8:24]
|
| 97 |
+
dx_key = hashlib.sha256(MASTER_KEY + file_header_hash).digest()
|
| 98 |
+
dx_encrypted = fdata[24:24+22624]
|
| 99 |
+
dx = aes_cfb128_decrypt(dx_key, IV, dx_encrypted)
|
| 100 |
+
|
| 101 |
+
print("=== DX Header ===")
|
| 102 |
+
print(f"Magic: {dx[:8]}")
|
| 103 |
+
valid_size = struct.unpack_from("<Q", dx, 8)[0]
|
| 104 |
+
print(f"Valid size: {valid_size}")
|
| 105 |
+
print(f"Container magic: {dx[16:24].hex()}")
|
| 106 |
+
total_value = struct.unpack_from("<Q", dx, 24)[0]
|
| 107 |
+
print(f"DX[24] value: {total_value}")
|
| 108 |
+
checksum = dx[32:48]
|
| 109 |
+
print(f"Checksum: {checksum.hex()}")
|
| 110 |
+
s1, s2 = struct.unpack_from("<QQ", dx, 48)
|
| 111 |
+
print(f"Sizes: ({s1}, {s2})")
|
| 112 |
+
|
| 113 |
+
# Step 2: Decrypt config
|
| 114 |
+
sha_input = dx[48:64] + dx[32:48] # sizes + checksum
|
| 115 |
+
config_key = hashlib.sha256(sha_input).digest()
|
| 116 |
+
config_enc = dx[64:64+11920]
|
| 117 |
+
config_dec = aes_cfb128_decrypt(config_key, IV, config_enc)
|
| 118 |
+
|
| 119 |
+
# Save
|
| 120 |
+
with open("temp/config_decrypted.bin", "wb") as f:
|
| 121 |
+
f.write(config_dec)
|
| 122 |
+
print(f"\nConfig decrypted: {len(config_dec)} bytes, saved to temp/config_decrypted.bin")
|
| 123 |
+
|
| 124 |
+
# Check container magic
|
| 125 |
+
magic = config_dec[:8]
|
| 126 |
+
print(f"Config container magic: {magic.hex()}")
|
| 127 |
+
assert magic == bytes.fromhex("4a1a082b25000000"), "Container magic mismatch!"
|
| 128 |
+
|
| 129 |
+
# Strip 8-byte container header
|
| 130 |
+
config_data = config_dec[8:]
|
| 131 |
+
print(f"Config payload: {len(config_data)} bytes")
|
| 132 |
+
|
| 133 |
+
print("\n=== Config Protobuf Structure (top-level fields only) ===")
|
| 134 |
+
# Parse just top-level to see field patterns
|
| 135 |
+
off = 0
|
| 136 |
+
config_fields = []
|
| 137 |
+
while off < len(config_data):
|
| 138 |
+
if off >= len(config_data):
|
| 139 |
+
break
|
| 140 |
+
tag_byte = config_data[off]
|
| 141 |
+
field_num = tag_byte >> 3
|
| 142 |
+
wire_type = tag_byte & 0x07
|
| 143 |
+
if field_num == 0 or field_num > 30:
|
| 144 |
+
break
|
| 145 |
+
off += 1
|
| 146 |
+
if wire_type == 0:
|
| 147 |
+
val, off = decode_varint(config_data, off)
|
| 148 |
+
config_fields.append({"fn": field_num, "wt": wire_type, "val": val, "off": off})
|
| 149 |
+
elif wire_type == 2:
|
| 150 |
+
length, off = decode_varint(config_data, off)
|
| 151 |
+
if off + length > len(config_data):
|
| 152 |
+
break
|
| 153 |
+
payload = config_data[off:off+length]
|
| 154 |
+
# Try string
|
| 155 |
+
try:
|
| 156 |
+
s = payload.decode('ascii')
|
| 157 |
+
readable = all(c.isprintable() or c in '\n\r\t' for c in s)
|
| 158 |
+
except:
|
| 159 |
+
readable = False
|
| 160 |
+
if readable and len(payload) < 200:
|
| 161 |
+
print(f" field {field_num} (string, len={length}, off={off}): {payload[:80]}")
|
| 162 |
+
else:
|
| 163 |
+
# check first bytes for sub-message identification
|
| 164 |
+
fbytes = payload[:16].hex()
|
| 165 |
+
print(f" field {field_num} (msg/bytes, len={length}, off={off}): {fbytes}...")
|
| 166 |
+
config_fields.append({"fn": field_num, "wt": wire_type, "len": length, "off": off, "data": payload})
|
| 167 |
+
off += length
|
| 168 |
+
elif wire_type == 5:
|
| 169 |
+
if off + 4 > len(config_data):
|
| 170 |
+
break
|
| 171 |
+
val = struct.unpack_from("<I", config_data, off)[0]
|
| 172 |
+
config_fields.append({"fn": field_num, "wt": wire_type, "val": val, "off": off})
|
| 173 |
+
off += 4
|
| 174 |
+
elif wire_type == 1:
|
| 175 |
+
if off + 8 > len(config_data):
|
| 176 |
+
break
|
| 177 |
+
val = struct.unpack_from("<Q", config_data, off)[0]
|
| 178 |
+
config_fields.append({"fn": field_num, "wt": wire_type, "val": val, "off": off})
|
| 179 |
+
off += 8
|
| 180 |
+
else:
|
| 181 |
+
break
|
| 182 |
+
|
| 183 |
+
# Count field types
|
| 184 |
+
from collections import Counter
|
| 185 |
+
field_counts = Counter(f["fn"] for f in config_fields)
|
| 186 |
+
print(f"\nField type counts: {dict(field_counts)}")
|
| 187 |
+
print(f"Total fields: {len(config_fields)}")
|
| 188 |
+
|
| 189 |
+
# Decode each field 1 (repeated message) to find model entries
|
| 190 |
+
print("\n=== Model entries (field 1) ===")
|
| 191 |
+
f1_entries = [f for f in config_fields if f["fn"] == 1 and "data" in f]
|
| 192 |
+
for i, entry in enumerate(f1_entries):
|
| 193 |
+
data = entry["data"]
|
| 194 |
+
# Parse sub-fields
|
| 195 |
+
sub_off = 0
|
| 196 |
+
name = ""
|
| 197 |
+
model_type = -1
|
| 198 |
+
onnx_path = ""
|
| 199 |
+
while sub_off < len(data):
|
| 200 |
+
tag = data[sub_off]
|
| 201 |
+
fn = tag >> 3
|
| 202 |
+
wt = tag & 7
|
| 203 |
+
if fn == 0 or fn > 20:
|
| 204 |
+
break
|
| 205 |
+
sub_off += 1
|
| 206 |
+
if wt == 0:
|
| 207 |
+
val, sub_off = decode_varint(data, sub_off)
|
| 208 |
+
if fn == 2:
|
| 209 |
+
model_type = val
|
| 210 |
+
elif wt == 2:
|
| 211 |
+
ln, sub_off = decode_varint(data, sub_off)
|
| 212 |
+
if sub_off + ln > len(data):
|
| 213 |
+
break
|
| 214 |
+
p = data[sub_off:sub_off+ln]
|
| 215 |
+
if fn == 1:
|
| 216 |
+
try:
|
| 217 |
+
name = p.decode('ascii')
|
| 218 |
+
except:
|
| 219 |
+
name = p.hex()
|
| 220 |
+
elif fn == 3:
|
| 221 |
+
try:
|
| 222 |
+
onnx_path = p.decode('ascii', errors='replace')
|
| 223 |
+
except:
|
| 224 |
+
onnx_path = p.hex()
|
| 225 |
+
sub_off += ln
|
| 226 |
+
elif wt == 5:
|
| 227 |
+
sub_off += 4
|
| 228 |
+
elif wt == 1:
|
| 229 |
+
sub_off += 8
|
| 230 |
+
else:
|
| 231 |
+
break
|
| 232 |
+
print(f" [{i:02d}] name={name:20s} type={model_type}")
|
| 233 |
+
if onnx_path:
|
| 234 |
+
print(f" path={onnx_path[:80]}")
|
| 235 |
+
|
| 236 |
+
# Now look for checksums in the ENTIRE config (not just protobuf)
|
| 237 |
+
print("\n=== Searching ALL known checksums in config ===")
|
| 238 |
+
import json
|
| 239 |
+
with open("temp/crypto_log.json") as f:
|
| 240 |
+
log = json.load(f)
|
| 241 |
+
sha256s = [op for op in log if op["op"] == "sha256"]
|
| 242 |
+
|
| 243 |
+
# Get all unique checksums from 32-byte SHA256 inputs
|
| 244 |
+
checksums_found = 0
|
| 245 |
+
for s in sha256s:
|
| 246 |
+
inp = bytes.fromhex(s["input"])
|
| 247 |
+
if len(inp) == 32:
|
| 248 |
+
chk = inp[16:32] # last 16 bytes = checksum
|
| 249 |
+
pos = config_data.find(chk)
|
| 250 |
+
if pos >= 0:
|
| 251 |
+
checksums_found += 1
|
| 252 |
+
if checksums_found <= 5:
|
| 253 |
+
sizes = struct.unpack_from("<QQ", inp, 0)
|
| 254 |
+
print(f" FOUND checksum at config offset {pos}: sizes={sizes}")
|
| 255 |
+
pos2 = config_dec.find(chk)
|
| 256 |
+
if pos2 >= 0 and pos2 < 8:
|
| 257 |
+
pass # In container header
|
| 258 |
+
|
| 259 |
+
print(f"Total checksums found in config: {checksums_found} / {len([s for s in sha256s if len(bytes.fromhex(s['input'])) == 32])}")
|
_archive/analysis/find_chunks.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Find all chunk checksums and their positions in the .onemodel file."""
|
| 2 |
+
import struct, json
|
| 3 |
+
|
| 4 |
+
with open("ocr_data/oneocr.onemodel", "rb") as f:
|
| 5 |
+
fdata = f.read()
|
| 6 |
+
log = json.load(open("temp/crypto_log.json"))
|
| 7 |
+
|
| 8 |
+
sha256s = [op for op in log if op["op"] == "sha256"]
|
| 9 |
+
sha_map = {}
|
| 10 |
+
for s in sha256s:
|
| 11 |
+
sha_map[s["output"]] = s["input"]
|
| 12 |
+
|
| 13 |
+
decrypts = [op for op in log if op["op"] == "decrypt"]
|
| 14 |
+
|
| 15 |
+
print(f"File size: {len(fdata)} bytes")
|
| 16 |
+
print(f"Payload starts at: 22684")
|
| 17 |
+
|
| 18 |
+
# For each decrypt, find its checksum in the file
|
| 19 |
+
results = []
|
| 20 |
+
for i, d in enumerate(decrypts[1:], 1): # skip DX (dec#00)
|
| 21 |
+
sha_inp = bytes.fromhex(sha_map[d["aes_key"]])
|
| 22 |
+
if len(sha_inp) < 32:
|
| 23 |
+
continue
|
| 24 |
+
chk = sha_inp[16:32]
|
| 25 |
+
s1, s2 = struct.unpack_from("<QQ", sha_inp, 0)
|
| 26 |
+
enc_size = d["input_size"]
|
| 27 |
+
|
| 28 |
+
pos = fdata.find(chk)
|
| 29 |
+
results.append({
|
| 30 |
+
"dec_idx": i,
|
| 31 |
+
"chk_file_offset": pos,
|
| 32 |
+
"chk_hex": chk.hex(),
|
| 33 |
+
"size1": s1,
|
| 34 |
+
"size2": s2,
|
| 35 |
+
"enc_size": enc_size,
|
| 36 |
+
})
|
| 37 |
+
|
| 38 |
+
# Sort by checksum file offset
|
| 39 |
+
results.sort(key=lambda r: r["chk_file_offset"])
|
| 40 |
+
|
| 41 |
+
print(f"\n{'dec#':>5} {'chk_offset':>12} {'data_offset':>12} {'enc_size':>10} {'end_offset':>12} {'size1':>10} {'size2':>10}")
|
| 42 |
+
print("-" * 90)
|
| 43 |
+
for r in results:
|
| 44 |
+
if r["chk_file_offset"] >= 0:
|
| 45 |
+
# The chunk header is: 4_bytes + 16_checksum + 8_size1 + 8_size2 = 36 bytes
|
| 46 |
+
# Data starts at chk_offset - 4 + 36 = chk_offset + 32
|
| 47 |
+
data_off = r["chk_file_offset"] + 32
|
| 48 |
+
end_off = data_off + r["enc_size"]
|
| 49 |
+
print(f" {r['dec_idx']:3d} {r['chk_file_offset']:12d} {data_off:12d} {r['enc_size']:10d} {end_off:12d} {r['size1']:10d} {r['size2']:10d}")
|
| 50 |
+
else:
|
| 51 |
+
print(f" {r['dec_idx']:3d} NOT FOUND {r['enc_size']:10d} {r['size1']:10d} {r['size2']:10d}")
|
| 52 |
+
|
| 53 |
+
# Verify chunk continuity
|
| 54 |
+
print("\n=== Chunk continuity check ===")
|
| 55 |
+
prev_end = None
|
| 56 |
+
for r in results:
|
| 57 |
+
if r["chk_file_offset"] < 0:
|
| 58 |
+
continue
|
| 59 |
+
data_off = r["chk_file_offset"] + 32
|
| 60 |
+
chunk_header_start = r["chk_file_offset"] - 4 # 4 bytes before checksum
|
| 61 |
+
|
| 62 |
+
if prev_end is not None:
|
| 63 |
+
gap = chunk_header_start - prev_end
|
| 64 |
+
if gap != 0:
|
| 65 |
+
print(f" Gap between chunks: {gap} bytes (prev_end={prev_end}, next_header={chunk_header_start})")
|
| 66 |
+
if gap > 0:
|
| 67 |
+
gap_data = fdata[prev_end:chunk_header_start]
|
| 68 |
+
print(f" Gap bytes: {gap_data.hex()}")
|
| 69 |
+
|
| 70 |
+
prev_end = data_off + r["enc_size"]
|
| 71 |
+
|
| 72 |
+
print(f"\nExpected file end: {prev_end}")
|
| 73 |
+
print(f"Actual file end: {len(fdata)}")
|
| 74 |
+
|
| 75 |
+
# Verify the 4 bytes before each checksum
|
| 76 |
+
print("\n=== 4 bytes before each checksum ===")
|
| 77 |
+
for r in results[:10]:
|
| 78 |
+
if r["chk_file_offset"] >= 4:
|
| 79 |
+
pre = fdata[r["chk_file_offset"]-4:r["chk_file_offset"]]
|
| 80 |
+
print(f" dec#{r['dec_idx']:02d}: pre_bytes={pre.hex()} ({struct.unpack_from('<I', pre)[0]})")
|
_archive/analysis/walk_payload.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Walk ALL payload chunks in the .onemodel file and decrypt them statically.
|
| 2 |
+
Full cross-platform static decryptor - no DLL or Windows APIs needed.
|
| 3 |
+
"""
|
| 4 |
+
import struct
|
| 5 |
+
import hashlib
|
| 6 |
+
from Crypto.Cipher import AES
|
| 7 |
+
|
| 8 |
+
MASTER_KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 9 |
+
IV = b"Copyright @ OneO"
|
| 10 |
+
CONTAINER_MAGIC = bytes.fromhex("4a1a082b25000000")
|
| 11 |
+
|
| 12 |
+
def aes_cfb128_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
| 13 |
+
cipher = AES.new(key, AES.MODE_CFB, iv=iv, segment_size=128)
|
| 14 |
+
return cipher.decrypt(data)
|
| 15 |
+
|
| 16 |
+
with open("ocr_data/oneocr.onemodel", "rb") as f:
|
| 17 |
+
fdata = f.read()
|
| 18 |
+
|
| 19 |
+
# Parse file header
|
| 20 |
+
H = struct.unpack_from("<Q", fdata, 0)[0]
|
| 21 |
+
file_hash = fdata[8:24]
|
| 22 |
+
print(f"File size: {len(fdata):,} bytes")
|
| 23 |
+
print(f"Header value H: {H}")
|
| 24 |
+
print(f"DX encrypted size: {H-12}")
|
| 25 |
+
print(f"Payload start: {H+16}")
|
| 26 |
+
|
| 27 |
+
# Decrypt DX index
|
| 28 |
+
dx_key = hashlib.sha256(MASTER_KEY + file_hash).digest()
|
| 29 |
+
dx_enc = fdata[24:H+12]
|
| 30 |
+
dx = aes_cfb128_decrypt(dx_key, IV, dx_enc)
|
| 31 |
+
|
| 32 |
+
valid_size = struct.unpack_from("<Q", dx, 8)[0]
|
| 33 |
+
print(f"DX magic: {dx[:8]}")
|
| 34 |
+
print(f"DX valid size: {valid_size}")
|
| 35 |
+
|
| 36 |
+
# Decrypt config from DX
|
| 37 |
+
config_sha_input = dx[48:64] + dx[32:48] # sizes + checksum
|
| 38 |
+
config_key = hashlib.sha256(config_sha_input).digest()
|
| 39 |
+
config_s1 = struct.unpack_from("<Q", dx, 48)[0]
|
| 40 |
+
config_enc = dx[64:64+config_s1+8]
|
| 41 |
+
config_dec = aes_cfb128_decrypt(config_key, IV, config_enc)
|
| 42 |
+
print(f"Config decrypted: {len(config_dec)} bytes, magic match: {config_dec[:8] == CONTAINER_MAGIC}")
|
| 43 |
+
|
| 44 |
+
# Walk payload chunks
|
| 45 |
+
off = H + 16
|
| 46 |
+
chunk_idx = 0
|
| 47 |
+
chunks = []
|
| 48 |
+
|
| 49 |
+
while off + 32 <= len(fdata):
|
| 50 |
+
chk = fdata[off:off+16]
|
| 51 |
+
s1, s2 = struct.unpack_from("<QQ", fdata, off+16)
|
| 52 |
+
|
| 53 |
+
if s2 != s1 + 24 or s1 == 0 or s1 > len(fdata):
|
| 54 |
+
break
|
| 55 |
+
|
| 56 |
+
enc_size = s1 + 8
|
| 57 |
+
data_off = off + 32
|
| 58 |
+
|
| 59 |
+
if data_off + enc_size > len(fdata):
|
| 60 |
+
print(f"WARNING: chunk#{chunk_idx} extends past file end!")
|
| 61 |
+
break
|
| 62 |
+
|
| 63 |
+
# Derive per-chunk key
|
| 64 |
+
sha_input = fdata[off+16:off+32] + fdata[off:off+16] # sizes + checksum
|
| 65 |
+
chunk_key = hashlib.sha256(sha_input).digest()
|
| 66 |
+
|
| 67 |
+
# Decrypt
|
| 68 |
+
dec_data = aes_cfb128_decrypt(chunk_key, IV, fdata[data_off:data_off+enc_size])
|
| 69 |
+
|
| 70 |
+
magic_ok = dec_data[:8] == CONTAINER_MAGIC
|
| 71 |
+
payload = dec_data[8:] # strip container header
|
| 72 |
+
|
| 73 |
+
chunks.append({
|
| 74 |
+
"idx": chunk_idx,
|
| 75 |
+
"file_offset": off,
|
| 76 |
+
"data_offset": data_off,
|
| 77 |
+
"size1": s1,
|
| 78 |
+
"enc_size": enc_size,
|
| 79 |
+
"magic_ok": magic_ok,
|
| 80 |
+
"payload": payload,
|
| 81 |
+
})
|
| 82 |
+
|
| 83 |
+
print(f" chunk#{chunk_idx:02d}: off={off:>10} s1={s1:>10} magic={'OK' if magic_ok else 'FAIL'} payload_start={payload[:8].hex()}")
|
| 84 |
+
|
| 85 |
+
off = data_off + enc_size
|
| 86 |
+
chunk_idx += 1
|
| 87 |
+
|
| 88 |
+
print(f"\nTotal chunks: {chunk_idx}")
|
| 89 |
+
print(f"File bytes remaining: {len(fdata) - off}")
|
| 90 |
+
print(f"All magic OK: {all(c['magic_ok'] for c in chunks)}")
|
| 91 |
+
|
| 92 |
+
# Identify ONNX models (start with protobuf field tags typical for ONNX ModelProto)
|
| 93 |
+
print("\n=== ONNX model identification ===")
|
| 94 |
+
onnx_count = 0
|
| 95 |
+
for c in chunks:
|
| 96 |
+
payload = c["payload"]
|
| 97 |
+
# ONNX ModelProto fields: 1(ir_version), 2(opset_import), 3(producer_name), etc.
|
| 98 |
+
# Field 1 varint starts with 0x08
|
| 99 |
+
# Actually check for ONNX-specific protobuf pattern
|
| 100 |
+
is_onnx = False
|
| 101 |
+
if len(payload) > 100:
|
| 102 |
+
# Check for typical ONNX patterns
|
| 103 |
+
if payload[0] == 0x08 and payload[1] in (0x06, 0x07): # ir_version 6 or 7
|
| 104 |
+
is_onnx = True
|
| 105 |
+
|
| 106 |
+
if is_onnx:
|
| 107 |
+
onnx_count += 1
|
| 108 |
+
print(f" chunk#{c['idx']:02d}: ONNX model, size={len(payload):,} bytes")
|
| 109 |
+
|
| 110 |
+
print(f"\nTotal ONNX models found: {onnx_count}")
|
| 111 |
+
print(f"Total non-ONNX chunks: {chunk_idx - onnx_count}")
|
| 112 |
+
|
| 113 |
+
# Show what non-ONNX chunks look like
|
| 114 |
+
print("\n=== Non-ONNX chunk types ===")
|
| 115 |
+
for c in chunks:
|
| 116 |
+
payload = c["payload"]
|
| 117 |
+
if len(payload) < 100 or payload[0] != 0x08 or payload[1] not in (0x06, 0x07):
|
| 118 |
+
# Try ASCII
|
| 119 |
+
try:
|
| 120 |
+
s = payload[:40].decode('ascii')
|
| 121 |
+
readable = all(ch.isprintable() or ch in '\n\r\t' for ch in s)
|
| 122 |
+
except:
|
| 123 |
+
readable = False
|
| 124 |
+
|
| 125 |
+
if readable:
|
| 126 |
+
preview = payload[:60].decode('ascii', errors='replace').replace('\n', '\\n')
|
| 127 |
+
else:
|
| 128 |
+
preview = payload[:32].hex()
|
| 129 |
+
print(f" chunk#{c['idx']:02d}: size={len(payload):>8,} type={'text' if readable else 'binary'} preview={preview}")
|
_archive/analyze_lm_features.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Understand what the 21-dim input features are for LM models 11-32.
|
| 2 |
+
These models take data[1,21,1,1] → softmax[1,2] (binary classifier).
|
| 3 |
+
We need to figure out what 21 features to compute from the recognizer output."""
|
| 4 |
+
import onnx
|
| 5 |
+
from onnx import numpy_helper
|
| 6 |
+
import numpy as np
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import onnxruntime as ort
|
| 9 |
+
|
| 10 |
+
# The 21 input features likely come from CTC recognizer statistics.
|
| 11 |
+
# Let's test with the unlocked models using some hypothetical feature vectors.
|
| 12 |
+
|
| 13 |
+
models_dir = Path("oneocr_extracted/onnx_models_unlocked")
|
| 14 |
+
|
| 15 |
+
# Load a LangSm model (model_11 = Latin LangSm)
|
| 16 |
+
sess_sm = ort.InferenceSession(str(list(models_dir.glob("model_11_*"))[0]))
|
| 17 |
+
# Load a LangMd model (model_22 = Latin LangMd)
|
| 18 |
+
sess_md = ort.InferenceSession(str(list(models_dir.glob("model_22_*"))[0]))
|
| 19 |
+
|
| 20 |
+
print("LangSm (model_11) inputs:", [(i.name, i.shape, i.type) for i in sess_sm.get_inputs()])
|
| 21 |
+
print("LangSm (model_11) outputs:", [(o.name, o.shape, o.type) for o in sess_sm.get_outputs()])
|
| 22 |
+
print()
|
| 23 |
+
print("LangMd (model_22) inputs:", [(i.name, i.shape, i.type) for i in sess_md.get_inputs()])
|
| 24 |
+
print("LangMd (model_22) outputs:", [(o.name, o.shape, o.type) for o in sess_md.get_outputs()])
|
| 25 |
+
|
| 26 |
+
# The normalization constants inside the model tell us about expected feature ranges
|
| 27 |
+
# From earlier analysis:
|
| 28 |
+
# Add constant: [-1.273, 0.396, 0.134, 0.151, 0.084, 0.346, 0.472, 0.435,
|
| 29 |
+
# 0.346, 0.581, 0.312, 0.036, 0.045, 0.033, 0.026, 0.022,
|
| 30 |
+
# 0.044, 0.038, 0.029, 0.031, 0.696]
|
| 31 |
+
# Div constant: [0.641, 0.914, 0.377, 0.399, 0.302, 0.657, 0.814, 0.769,
|
| 32 |
+
# 0.658, 0.878, 0.617, 0.153, 0.166, 0.137, 0.120, 0.108,
|
| 33 |
+
# 0.132, 0.115, 0.105, 0.108, 0.385]
|
| 34 |
+
#
|
| 35 |
+
# This means typical feature ranges are:
|
| 36 |
+
# feature[0]: mean = 1.273, std = 0.641 (large negative offset → feature is centered around 1.27)
|
| 37 |
+
# feature[20]: mean = -0.696, std = 0.385
|
| 38 |
+
#
|
| 39 |
+
# Features 0: Large range → possibly average log-probability or entropy
|
| 40 |
+
# Features 1-10: Medium range → possibly per-class probabilities or scores
|
| 41 |
+
# Features 11-20: Small range → possibly confidence statistics
|
| 42 |
+
|
| 43 |
+
# Let's check: extract normalization params from model_11
|
| 44 |
+
model_11 = onnx.load(str(list(Path("oneocr_extracted/onnx_models").glob("model_11_*"))[0]))
|
| 45 |
+
|
| 46 |
+
for node in model_11.graph.node:
|
| 47 |
+
if node.op_type == "Constant":
|
| 48 |
+
name = node.output[0]
|
| 49 |
+
if name in ['26', '28']: # Add and Div constants
|
| 50 |
+
for attr in node.attribute:
|
| 51 |
+
if attr.type == 4:
|
| 52 |
+
data = numpy_helper.to_array(attr.t)
|
| 53 |
+
label = "Add (=-mean)" if name == '26' else "Div (=std)"
|
| 54 |
+
print(f"\n{label}: {data.flatten()}")
|
| 55 |
+
# The mean tells us the expected center of each feature
|
| 56 |
+
if name == '26':
|
| 57 |
+
# mean = -add_const
|
| 58 |
+
means = -data.flatten()
|
| 59 |
+
print(f" Implied means: {means}")
|
| 60 |
+
|
| 61 |
+
# Hypothesis: The 21 features are CTC decoder statistics:
|
| 62 |
+
# Based on the normalization centers (means):
|
| 63 |
+
# feat[0]: ~1.27 — could be average negative log-likelihood (NLL) per character
|
| 64 |
+
# feat[1]: ~-0.40 — could be a score
|
| 65 |
+
# feat[2-10]: ~0-0.5 — could be per-script probabilities from ScriptID
|
| 66 |
+
# feat[11-20]: ~0-0.04 — could be character-level statistics
|
| 67 |
+
|
| 68 |
+
# Let's test what outputs the recognizer produces
|
| 69 |
+
rec_path = list(Path("oneocr_extracted/onnx_models").glob("model_02_*"))[0]
|
| 70 |
+
rec_sess = ort.InferenceSession(str(rec_path))
|
| 71 |
+
print(f"\nRecognizer (model_02) outputs:")
|
| 72 |
+
for o in rec_sess.get_outputs():
|
| 73 |
+
print(f" {o.name}: {o.shape}")
|
| 74 |
+
|
| 75 |
+
# Try running recognizer and computing statistics
|
| 76 |
+
test_data = np.random.randn(1, 3, 60, 200).astype(np.float32) * 0.1
|
| 77 |
+
seq_lengths = np.array([50], dtype=np.int32) # 200/4
|
| 78 |
+
result = rec_sess.run(None, {"data": test_data, "seq_lengths": seq_lengths})
|
| 79 |
+
logprobs = result[0]
|
| 80 |
+
print(f"\nRecognizer output: {logprobs.shape}")
|
| 81 |
+
print(f" Log-prob range: [{logprobs.min():.4f}, {logprobs.max():.4f}]")
|
| 82 |
+
|
| 83 |
+
# Compute possible features from recognizer output:
|
| 84 |
+
lp = logprobs[:, 0, :] # [T, num_classes]
|
| 85 |
+
best_probs = np.exp(lp.max(axis=-1)) # Best probability per frame
|
| 86 |
+
mean_best = best_probs.mean()
|
| 87 |
+
print(f"\n Mean best prob per frame: {mean_best:.4f}")
|
| 88 |
+
print(f" Mean log-prob max: {lp.max(axis=-1).mean():.4f}")
|
| 89 |
+
print(f" Entropy per frame: {(-np.exp(lp) * lp).sum(axis=-1).mean():.4f}")
|
| 90 |
+
|
| 91 |
+
# The 21 features might be computed as:
|
| 92 |
+
# feat[0] = average log-probability (NLL) → how confident the model is
|
| 93 |
+
# feat[1..K] = character frequency statistics
|
| 94 |
+
# feat[K+1..20] = transition statistics
|
| 95 |
+
#
|
| 96 |
+
# Without the exact feature computation code from the DLL, we'll need to
|
| 97 |
+
# reverse-engineer or approximate the feature vector.
|
| 98 |
+
|
| 99 |
+
# For now, test the LM models with various feature values
|
| 100 |
+
print(f"\n--- Testing LM models with various inputs ---")
|
| 101 |
+
for name, features in [
|
| 102 |
+
("all_zeros", np.zeros(21)),
|
| 103 |
+
("high_conf", np.array([0.0, 0.5, 0.9, 0.9, 0.9, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 1.0])),
|
| 104 |
+
("low_conf", np.array([3.0, -0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.1])),
|
| 105 |
+
("typical", np.array([1.2, -0.4, 0.1, 0.15, 0.08, 0.35, 0.47, 0.43, 0.35, 0.58, 0.31, 0.04, 0.05, 0.03, 0.03, 0.02, 0.04, 0.04, 0.03, 0.03, 0.7])),
|
| 106 |
+
]:
|
| 107 |
+
data = features.astype(np.float32).reshape(1, 21, 1, 1)
|
| 108 |
+
sm_out = sess_sm.run(None, {"data": data})[0]
|
| 109 |
+
md_out = sess_md.run(None, {"data": data})[0]
|
| 110 |
+
print(f" {name:12s}: LangSm={sm_out.flatten()}, LangMd={md_out.flatten()}")
|
_archive/analyze_models.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze all extracted ONNX models — inputs, outputs, ops, runtime compatibility."""
|
| 2 |
+
import onnx
|
| 3 |
+
import onnxruntime as ort
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 7 |
+
|
| 8 |
+
print("=" * 120)
|
| 9 |
+
print(f"{'#':>3} {'Name':40s} {'KB':>7} {'IR':>3} {'Producer':15s} {'Nodes':>5} {'Inputs':35s} {'Outputs':25s} {'RT':10s} Custom Ops")
|
| 10 |
+
print("=" * 120)
|
| 11 |
+
|
| 12 |
+
for f in sorted(models_dir.glob("*.onnx")):
|
| 13 |
+
m = onnx.load(str(f))
|
| 14 |
+
idx = f.name.split("_")[1]
|
| 15 |
+
ir = m.ir_version
|
| 16 |
+
prod = (m.producer_name or "?")[:15]
|
| 17 |
+
nodes = len(m.graph.node)
|
| 18 |
+
|
| 19 |
+
# Input shapes
|
| 20 |
+
inputs = []
|
| 21 |
+
for i in m.graph.input:
|
| 22 |
+
dims = []
|
| 23 |
+
if i.type.tensor_type.HasField("shape"):
|
| 24 |
+
for d in i.type.tensor_type.shape.dim:
|
| 25 |
+
dims.append(str(d.dim_value) if d.dim_value else d.dim_param or "?")
|
| 26 |
+
inputs.append(f"{i.name}[{','.join(dims)}]")
|
| 27 |
+
|
| 28 |
+
# Output shapes
|
| 29 |
+
outputs = []
|
| 30 |
+
for o in m.graph.output:
|
| 31 |
+
dims = []
|
| 32 |
+
if o.type.tensor_type.HasField("shape"):
|
| 33 |
+
for d in o.type.tensor_type.shape.dim:
|
| 34 |
+
dims.append(str(d.dim_value) if d.dim_value else d.dim_param or "?")
|
| 35 |
+
outputs.append(f"{o.name}[{','.join(dims)}]")
|
| 36 |
+
|
| 37 |
+
inp_str = "; ".join(inputs)[:35]
|
| 38 |
+
out_str = "; ".join(outputs)[:25]
|
| 39 |
+
|
| 40 |
+
# Custom ops
|
| 41 |
+
opsets = [o.domain for o in m.opset_import if o.domain]
|
| 42 |
+
custom = ", ".join(opsets) if opsets else "-"
|
| 43 |
+
|
| 44 |
+
# Runtime check
|
| 45 |
+
try:
|
| 46 |
+
sess = ort.InferenceSession(str(f), providers=["CPUExecutionProvider"])
|
| 47 |
+
rt = "OK"
|
| 48 |
+
except Exception as e:
|
| 49 |
+
rt = "CUSTOM"
|
| 50 |
+
|
| 51 |
+
size_kb = f.stat().st_size // 1024
|
| 52 |
+
print(f"{idx:>3} {f.stem:40s} {size_kb:>7} {ir:>3} {prod:15s} {nodes:>5} {inp_str:35s} {out_str:25s} {rt:10s} {custom}")
|
| 53 |
+
|
| 54 |
+
# Summary
|
| 55 |
+
print("\n=== OCR Pipeline Architecture ===")
|
| 56 |
+
print("""
|
| 57 |
+
OneOCR uses a MULTI-MODEL pipeline (not a single model):
|
| 58 |
+
|
| 59 |
+
1. DETECTOR (model_03, 13MB) — text detection in image
|
| 60 |
+
- Input: image tensor → Output: bounding boxes of text regions
|
| 61 |
+
|
| 62 |
+
2. CHARACTER RECOGNIZERS (model_00..10, 33) — per-script recognition
|
| 63 |
+
- Each script (Latin, Arabic, CJK, Cyrillic, etc.) has its own recognizer
|
| 64 |
+
- Input: cropped text region → Output: character probabilities
|
| 65 |
+
- Accompanied by: rnn.info, char2ind.txt, char2inschar.txt files
|
| 66 |
+
|
| 67 |
+
3. SMALL LANGUAGE MODELS (model_11..32, 26-28KB each)
|
| 68 |
+
- Post-processing character-level language models
|
| 69 |
+
- One per supported script/language
|
| 70 |
+
|
| 71 |
+
Problem for cross-platform:
|
| 72 |
+
- 23 models use custom op domain 'com.microsoft.oneocr'
|
| 73 |
+
- Custom ops like OneOCRFeatureExtract, DynamicQuantizeLSTM
|
| 74 |
+
- These are ONLY implemented in oneocr.dll (Windows)
|
| 75 |
+
- To run on Linux: need to reimplement these custom ops or find alternatives
|
| 76 |
+
""")
|
| 77 |
+
|
| 78 |
+
# Show config structure
|
| 79 |
+
print("=== Config Files (per-recognizer) ===")
|
| 80 |
+
config_dir = Path("oneocr_extracted/config_data")
|
| 81 |
+
config = (config_dir / "chunk_66_ocr_config.config.txt").read_text(errors="replace")
|
| 82 |
+
print(config[:500])
|
_archive/analyze_pipeline.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Full analysis of detector and scriptID models."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
def print_io(model_path, label):
|
| 7 |
+
m = onnx.load(str(model_path))
|
| 8 |
+
print(f'\n=== {label} ({Path(model_path).name}) ===')
|
| 9 |
+
print(f'Nodes: {len(m.graph.node)}')
|
| 10 |
+
|
| 11 |
+
print('Inputs:')
|
| 12 |
+
for i in m.graph.input:
|
| 13 |
+
dims = []
|
| 14 |
+
for d in i.type.tensor_type.shape.dim:
|
| 15 |
+
dims.append(str(d.dim_value) if d.dim_value else d.dim_param or '?')
|
| 16 |
+
print(f' {i.name}: [{", ".join(dims)}] dtype={i.type.tensor_type.elem_type}')
|
| 17 |
+
|
| 18 |
+
print('Outputs:')
|
| 19 |
+
for o in m.graph.output:
|
| 20 |
+
dims = []
|
| 21 |
+
for d in o.type.tensor_type.shape.dim:
|
| 22 |
+
dims.append(str(d.dim_value) if d.dim_value else d.dim_param or '?')
|
| 23 |
+
print(f' {o.name}: [{", ".join(dims)}] dtype={o.type.tensor_type.elem_type}')
|
| 24 |
+
|
| 25 |
+
custom = set()
|
| 26 |
+
for n in m.graph.node:
|
| 27 |
+
if n.domain:
|
| 28 |
+
custom.add((n.domain, n.op_type))
|
| 29 |
+
if custom:
|
| 30 |
+
print(f'Custom ops: {custom}')
|
| 31 |
+
else:
|
| 32 |
+
print('Custom ops: none')
|
| 33 |
+
return m
|
| 34 |
+
|
| 35 |
+
models_dir = Path('oneocr_extracted/onnx_models')
|
| 36 |
+
|
| 37 |
+
# Detector
|
| 38 |
+
m0 = print_io(next(models_dir.glob('model_00_*')), 'DETECTOR')
|
| 39 |
+
|
| 40 |
+
# ScriptID
|
| 41 |
+
m1 = print_io(next(models_dir.glob('model_01_*')), 'SCRIPT ID')
|
| 42 |
+
|
| 43 |
+
# A recognizer (Latin)
|
| 44 |
+
m2 = print_io(next(models_dir.glob('model_02_*')), 'RECOGNIZER Latin')
|
| 45 |
+
|
| 46 |
+
# Try running detector to see actual output shapes
|
| 47 |
+
import onnxruntime as ort
|
| 48 |
+
from PIL import Image
|
| 49 |
+
|
| 50 |
+
img = Image.open('image.png').convert('RGB')
|
| 51 |
+
w, h = img.size
|
| 52 |
+
|
| 53 |
+
sess = ort.InferenceSession(str(next(models_dir.glob('model_00_*'))),
|
| 54 |
+
providers=['CPUExecutionProvider'])
|
| 55 |
+
|
| 56 |
+
scale = 800 / max(h, w)
|
| 57 |
+
dh = (int(h * scale) + 31) // 32 * 32
|
| 58 |
+
dw = (int(w * scale) + 31) // 32 * 32
|
| 59 |
+
|
| 60 |
+
img_d = img.resize((dw, dh), Image.LANCZOS)
|
| 61 |
+
arr_d = np.array(img_d, dtype=np.float32)
|
| 62 |
+
arr_d = arr_d[:, :, ::-1] - [102.9801, 115.9465, 122.7717]
|
| 63 |
+
data_d = arr_d.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
|
| 64 |
+
im_info = np.array([[dh, dw, scale]], dtype=np.float32)
|
| 65 |
+
|
| 66 |
+
outputs = sess.run(None, {"data": data_d, "im_info": im_info})
|
| 67 |
+
print(f'\n=== DETECTOR OUTPUT SHAPES (image {w}x{h} -> {dw}x{dh}) ===')
|
| 68 |
+
output_names = [o.name for o in sess.get_outputs()]
|
| 69 |
+
for name, out in zip(output_names, outputs):
|
| 70 |
+
print(f' {name}: shape={out.shape} dtype={out.dtype} min={out.min():.4f} max={out.max():.4f}')
|
| 71 |
+
|
| 72 |
+
# Specifically analyze pixel_link outputs
|
| 73 |
+
# PixelLink has: pixel scores (text/non-text) + link scores (8 neighbors)
|
| 74 |
+
# FPN produces 3 scales
|
| 75 |
+
print('\n=== DETECTOR OUTPUT ANALYSIS ===')
|
| 76 |
+
for i, (name, out) in enumerate(zip(output_names, outputs)):
|
| 77 |
+
scores = 1.0 / (1.0 + np.exp(-out)) # sigmoid
|
| 78 |
+
hot = (scores > 0.5).sum()
|
| 79 |
+
print(f' [{i:2d}] {name:25s} shape={str(out.shape):20s} sigmoid_max={scores.max():.4f} hot_pixels(>0.5)={hot}')
|
_archive/attempts/bcrypt_decrypt.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OneOCR .onemodel decryption using Windows BCrypt CNG API directly.
|
| 3 |
+
Replicates the exact behavior of oneocr.dll's Crypto.cpp.
|
| 4 |
+
|
| 5 |
+
Known from DLL analysis:
|
| 6 |
+
- BCryptOpenAlgorithmProvider with L"AES"
|
| 7 |
+
- BCryptSetProperty L"ChainingMode" = L"ChainingModeCFB"
|
| 8 |
+
- BCryptGetProperty L"BlockLength" (→ 16)
|
| 9 |
+
- BCryptSetProperty L"MessageBlockLength" = 16 (→ CFB128)
|
| 10 |
+
- BCryptGenerateSymmetricKey with raw key bytes
|
| 11 |
+
- BCryptDecrypt
|
| 12 |
+
- SHA256Hash function exists (uses BCryptCreateHash/BCryptHashData/BCryptFinishHash)
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import ctypes
|
| 16 |
+
import ctypes.wintypes as wintypes
|
| 17 |
+
import struct
|
| 18 |
+
import hashlib
|
| 19 |
+
import zlib
|
| 20 |
+
from collections import Counter
|
| 21 |
+
import math
|
| 22 |
+
import os
|
| 23 |
+
|
| 24 |
+
# ═══════════════════════════════════════════════════════════════
|
| 25 |
+
# Windows BCrypt API via ctypes
|
| 26 |
+
# ═══════════════════════════════════════════════════════════════
|
| 27 |
+
|
| 28 |
+
bcrypt = ctypes.WinDLL("bcrypt")
|
| 29 |
+
|
| 30 |
+
BCRYPT_ALG_HANDLE = ctypes.c_void_p
|
| 31 |
+
BCRYPT_KEY_HANDLE = ctypes.c_void_p
|
| 32 |
+
NTSTATUS = ctypes.c_long
|
| 33 |
+
|
| 34 |
+
# Constants
|
| 35 |
+
BCRYPT_AES_ALGORITHM = "AES"
|
| 36 |
+
BCRYPT_SHA256_ALGORITHM = "SHA256"
|
| 37 |
+
BCRYPT_CHAINING_MODE = "ChainingMode"
|
| 38 |
+
BCRYPT_CHAIN_MODE_CFB = "ChainingModeCFB"
|
| 39 |
+
BCRYPT_BLOCK_LENGTH = "BlockLength"
|
| 40 |
+
BCRYPT_MESSAGE_BLOCK_LENGTH = "MessageBlockLength"
|
| 41 |
+
|
| 42 |
+
def check_status(status, msg=""):
|
| 43 |
+
if status != 0:
|
| 44 |
+
raise OSError(f"BCrypt error 0x{status & 0xFFFFFFFF:08x}: {msg}")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def bcrypt_sha256(data: bytes) -> bytes:
|
| 48 |
+
"""Compute SHA256 using Windows BCrypt API."""
|
| 49 |
+
hAlg = BCRYPT_ALG_HANDLE()
|
| 50 |
+
status = bcrypt.BCryptOpenAlgorithmProvider(
|
| 51 |
+
ctypes.byref(hAlg),
|
| 52 |
+
ctypes.c_wchar_p(BCRYPT_SHA256_ALGORITHM),
|
| 53 |
+
None, 0)
|
| 54 |
+
check_status(status, "SHA256 OpenAlgorithmProvider")
|
| 55 |
+
|
| 56 |
+
hHash = ctypes.c_void_p()
|
| 57 |
+
status = bcrypt.BCryptCreateHash(hAlg, ctypes.byref(hHash), None, 0, None, 0, 0)
|
| 58 |
+
check_status(status, "CreateHash")
|
| 59 |
+
|
| 60 |
+
status = bcrypt.BCryptHashData(hHash, data, len(data), 0)
|
| 61 |
+
check_status(status, "HashData")
|
| 62 |
+
|
| 63 |
+
hash_out = (ctypes.c_ubyte * 32)()
|
| 64 |
+
status = bcrypt.BCryptFinishHash(hHash, hash_out, 32, 0)
|
| 65 |
+
check_status(status, "FinishHash")
|
| 66 |
+
|
| 67 |
+
bcrypt.BCryptDestroyHash(hHash)
|
| 68 |
+
bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
|
| 69 |
+
|
| 70 |
+
return bytes(hash_out)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def bcrypt_aes_cfb_decrypt(ciphertext: bytes, key: bytes, iv: bytes,
|
| 74 |
+
message_block_length: int = 16) -> bytes:
|
| 75 |
+
"""Decrypt using AES-CFB via Windows BCrypt CNG API.
|
| 76 |
+
|
| 77 |
+
message_block_length: 1 for CFB8, 16 for CFB128
|
| 78 |
+
"""
|
| 79 |
+
hAlg = BCRYPT_ALG_HANDLE()
|
| 80 |
+
status = bcrypt.BCryptOpenAlgorithmProvider(
|
| 81 |
+
ctypes.byref(hAlg),
|
| 82 |
+
ctypes.c_wchar_p(BCRYPT_AES_ALGORITHM),
|
| 83 |
+
None, 0)
|
| 84 |
+
check_status(status, "AES OpenAlgorithmProvider")
|
| 85 |
+
|
| 86 |
+
# Set chaining mode to CFB
|
| 87 |
+
mode_str = BCRYPT_CHAIN_MODE_CFB
|
| 88 |
+
mode_buf = ctypes.create_unicode_buffer(mode_str)
|
| 89 |
+
mode_size = (len(mode_str) + 1) * 2 # UTF-16 with null terminator
|
| 90 |
+
status = bcrypt.BCryptSetProperty(
|
| 91 |
+
hAlg,
|
| 92 |
+
ctypes.c_wchar_p(BCRYPT_CHAINING_MODE),
|
| 93 |
+
mode_buf, mode_size, 0)
|
| 94 |
+
check_status(status, "SetProperty ChainingMode")
|
| 95 |
+
|
| 96 |
+
# Set message block length (feedback size)
|
| 97 |
+
mbl = ctypes.c_ulong(message_block_length)
|
| 98 |
+
status = bcrypt.BCryptSetProperty(
|
| 99 |
+
hAlg,
|
| 100 |
+
ctypes.c_wchar_p(BCRYPT_MESSAGE_BLOCK_LENGTH),
|
| 101 |
+
ctypes.byref(mbl), ctypes.sizeof(mbl), 0)
|
| 102 |
+
check_status(status, f"SetProperty MessageBlockLength={message_block_length}")
|
| 103 |
+
|
| 104 |
+
# Generate symmetric key
|
| 105 |
+
hKey = BCRYPT_KEY_HANDLE()
|
| 106 |
+
key_buf = (ctypes.c_ubyte * len(key))(*key)
|
| 107 |
+
status = bcrypt.BCryptGenerateSymmetricKey(
|
| 108 |
+
hAlg, ctypes.byref(hKey), None, 0, key_buf, len(key), 0)
|
| 109 |
+
check_status(status, "GenerateSymmetricKey")
|
| 110 |
+
|
| 111 |
+
# Prepare IV (BCrypt modifies it during decryption, so use a copy)
|
| 112 |
+
iv_buf = (ctypes.c_ubyte * 16)(*iv)
|
| 113 |
+
|
| 114 |
+
# Prepare input/output buffers
|
| 115 |
+
ct_buf = (ctypes.c_ubyte * len(ciphertext))(*ciphertext)
|
| 116 |
+
pt_buf = (ctypes.c_ubyte * len(ciphertext))()
|
| 117 |
+
result_len = ctypes.c_ulong(0)
|
| 118 |
+
|
| 119 |
+
# Decrypt
|
| 120 |
+
status = bcrypt.BCryptDecrypt(
|
| 121 |
+
hKey, ct_buf, len(ciphertext), None,
|
| 122 |
+
iv_buf, 16,
|
| 123 |
+
pt_buf, len(ciphertext),
|
| 124 |
+
ctypes.byref(result_len), 0)
|
| 125 |
+
check_status(status, "BCryptDecrypt")
|
| 126 |
+
|
| 127 |
+
# Cleanup
|
| 128 |
+
bcrypt.BCryptDestroyKey(hKey)
|
| 129 |
+
bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
|
| 130 |
+
|
| 131 |
+
return bytes(pt_buf[:result_len.value])
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def entropy(data: bytes) -> float:
|
| 135 |
+
"""Shannon entropy (bits per byte)."""
|
| 136 |
+
if not data:
|
| 137 |
+
return 0.0
|
| 138 |
+
freq = Counter(data)
|
| 139 |
+
total = len(data)
|
| 140 |
+
return -sum((c / total) * math.log2(c / total) for c in freq.values())
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def hex_dump(data: bytes, offset: int = 0, max_lines: int = 8) -> str:
|
| 144 |
+
lines = []
|
| 145 |
+
for i in range(0, min(len(data), max_lines * 16), 16):
|
| 146 |
+
hex_part = " ".join(f"{b:02x}" for b in data[i:i+16])
|
| 147 |
+
ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in data[i:i+16])
|
| 148 |
+
lines.append(f" {offset+i:08x}: {hex_part:<48s} {ascii_part}")
|
| 149 |
+
return "\n".join(lines)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def check_decrypted(data: bytes, label: str) -> bool:
    """Heuristically decide whether *data* looks like a successful decryption.

    Signals examined:
      * magic_number == 1 read as a little-endian uint32 at offset 0
      * sub-random entropy (< 7.0 bits/byte) over the first 4 KiB
      * zlib / gzip / LZ4-frame magic bytes at the start
      * a plausible protobuf first byte (0x08 or 0x0a)

    Promising candidates are reported to stdout (tagged with *label*) and
    probed for zlib / raw-DEFLATE streams at a few small byte offsets.
    Returns True when the data looks promising, False otherwise.

    Fix vs. original: the bare ``except:`` clauses around the decompression
    probes caught everything (including KeyboardInterrupt/SystemExit); they
    now catch only ``zlib.error``.
    """
    if not data or len(data) < 16:
        return False

    ent = entropy(data[:4096])
    u32_le = struct.unpack_from("<I", data, 0)[0]

    # Check for magic_number = 1
    magic_match = (u32_le == 1)

    # Check for protobuf (field 1 varint tag 0x08 / length-delimited tag 0x0a)
    protobuf = data[0] == 0x08 or data[0] == 0x0a

    # Check for compression headers
    zlib_header = data[:2] in [b"\x78\x01", b"\x78\x5e", b"\x78\x9c", b"\x78\xda"]
    gzip_header = data[:2] == b"\x1f\x8b"
    lz4_header = data[:4] == b"\x04\x22\x4d\x18"

    is_promising = magic_match or (ent < 7.0) or zlib_header or gzip_header or lz4_header

    if is_promising or protobuf:
        print(f"\n ★★★ {'MAGIC=1 !!!' if magic_match else 'Promising'}: {label}")
        print(f" Entropy: {ent:.3f}, uint32_LE[0]={u32_le}, first_byte=0x{data[0]:02x}")
        print(" First 128 bytes:")
        print(hex_dump(data[:128]))
        if zlib_header:
            print(" → ZLIB header detected!")
        if gzip_header:
            print(" → GZIP header detected!")
        if lz4_header:
            print(" → LZ4 header detected!")
        if magic_match:
            print(" → MAGIC_NUMBER = 1 !! This is likely correct decryption!")
        # Probe for a compressed stream starting shortly after a small header
        for skip in [0, 4, 8, 16, 32, 64]:
            chunk = data[skip:skip + min(10000, len(data) - skip)]
            try:
                dec = zlib.decompress(chunk)
                print(f" → ZLIB decompress SUCCESS at skip={skip}: {len(dec)} bytes!")
                print(f" First 64: {dec[:64].hex()}")
                return True
            except zlib.error:
                pass
            try:
                # wbits=-15 → raw DEFLATE stream without a zlib header
                dec = zlib.decompress(chunk, -15)
                print(f" → Raw DEFLATE decompress SUCCESS at skip={skip}: {len(dec)} bytes!")
                print(f" First 64: {dec[:64].hex()}")
                return True
            except zlib.error:
                pass
        return True
    return False
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ═══════════════════════════════════════════════════════════════
|
| 208 |
+
# MAIN
|
| 209 |
+
# ═══════════════════════════════════════════════════════════════
|
| 210 |
+
|
| 211 |
+
MODEL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"
|
| 212 |
+
KEY_RAW = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 213 |
+
KEY_SHA256 = bcrypt_sha256(KEY_RAW)
|
| 214 |
+
|
| 215 |
+
print("=" * 80)
|
| 216 |
+
print("OneOCR Decryption via Windows BCrypt CNG API")
|
| 217 |
+
print("=" * 80)
|
| 218 |
+
|
| 219 |
+
print(f"\nKey (raw): {KEY_RAW.hex()}")
|
| 220 |
+
print(f"Key (SHA256): {KEY_SHA256.hex()}")
|
| 221 |
+
print(f"Python hashlib SHA256: {hashlib.sha256(KEY_RAW).digest().hex()}")
|
| 222 |
+
print(f"BCrypt SHA256 match: {KEY_SHA256 == hashlib.sha256(KEY_RAW).digest()}")
|
| 223 |
+
|
| 224 |
+
# Read file
|
| 225 |
+
with open(MODEL_PATH, "rb") as f:
|
| 226 |
+
full_data = f.read()
|
| 227 |
+
filesize = len(full_data)
|
| 228 |
+
|
| 229 |
+
header_offset = struct.unpack_from("<I", full_data, 0)[0] # 22636
|
| 230 |
+
payload_size = struct.unpack_from("<Q", full_data, header_offset + 8)[0] # 58431147
|
| 231 |
+
payload_start = header_offset + 16 # 22652
|
| 232 |
+
|
| 233 |
+
print(f"\nFile size: {filesize:,}")
|
| 234 |
+
print(f"Header offset: {header_offset}")
|
| 235 |
+
print(f"Payload size: {payload_size:,}")
|
| 236 |
+
print(f"Payload start: {payload_start}")
|
| 237 |
+
|
| 238 |
+
# ═══════════════════════════════════════════════════════════════
|
| 239 |
+
# Test 1: Try standard combinations with BCrypt API
|
| 240 |
+
# ═══════════════════════════════════════════════════════════════
|
| 241 |
+
|
| 242 |
+
print("\n" + "=" * 80)
|
| 243 |
+
print("TEST 1: Standard combinations via BCrypt CFB128")
|
| 244 |
+
print("=" * 80)
|
| 245 |
+
|
| 246 |
+
iv_zero = b"\x00" * 16
|
| 247 |
+
iv_candidates = {
|
| 248 |
+
"zeros": iv_zero,
|
| 249 |
+
"file[8:24]": full_data[8:24],
|
| 250 |
+
"file[4:20]": full_data[4:20],
|
| 251 |
+
"file[0:16]": full_data[0:16],
|
| 252 |
+
f"file[{header_offset}:{header_offset+16}]": full_data[header_offset:header_offset+16],
|
| 253 |
+
f"file[{payload_start}:{payload_start+16}]": full_data[payload_start:payload_start+16],
|
| 254 |
+
"SHA256(key)[:16]": KEY_SHA256[:16],
|
| 255 |
+
"SHA256(key)[16:]": KEY_SHA256[16:],
|
| 256 |
+
"key_raw[:16]": KEY_RAW[:16],
|
| 257 |
+
"key_raw[16:]": KEY_RAW[16:],
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
key_candidates = {
|
| 261 |
+
"raw": KEY_RAW,
|
| 262 |
+
"SHA256": KEY_SHA256,
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
data_regions = {
|
| 266 |
+
"header[8:]": full_data[8:8+4096],
|
| 267 |
+
f"payload[{payload_start}:]": full_data[payload_start:payload_start+4096],
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
for mbl in [16, 1]: # CFB128 first (most likely), then CFB8
|
| 271 |
+
for key_name, key in key_candidates.items():
|
| 272 |
+
for iv_name, iv in iv_candidates.items():
|
| 273 |
+
for region_name, region_data in data_regions.items():
|
| 274 |
+
label = f"CFB{'128' if mbl == 16 else '8'} key={key_name} iv={iv_name} data={region_name}"
|
| 275 |
+
try:
|
| 276 |
+
dec = bcrypt_aes_cfb_decrypt(region_data, key, iv, mbl)
|
| 277 |
+
if check_decrypted(dec, label):
|
| 278 |
+
pass # Already printed
|
| 279 |
+
except Exception as e:
|
| 280 |
+
pass # Silently skip errors
|
| 281 |
+
|
| 282 |
+
# ═══════════════════════════════════════════════════════════════
|
| 283 |
+
# Test 2: Known-plaintext IV search
|
| 284 |
+
# ═══════════════════════════════════════════════════════════════
|
| 285 |
+
|
| 286 |
+
print("\n" + "=" * 80)
|
| 287 |
+
print("TEST 2: Known-plaintext IV search (magic_number=1)")
|
| 288 |
+
print("=" * 80)
|
| 289 |
+
print(" Searching for IV that produces magic_number=1 (0x01000000) at start...")
|
| 290 |
+
|
| 291 |
+
# For AES-CFB128, first block:
|
| 292 |
+
# plaintext[0:16] = AES_ECB_encrypt(IV, key) XOR ciphertext[0:16]
|
| 293 |
+
# We want plaintext[0:4] = 01 00 00 00 (LE)
|
| 294 |
+
# So: AES_ECB_encrypt(IV, key)[0:4] = ciphertext[0:4] XOR 01 00 00 00
|
| 295 |
+
|
| 296 |
+
# We can't easily predict AES output, so we try each IV candidate
|
| 297 |
+
# Try every 4-byte aligned position in header as IV, with both key candidates
|
| 298 |
+
|
| 299 |
+
found = False
|
| 300 |
+
for key_name, key in key_candidates.items():
|
| 301 |
+
for mbl in [16, 1]:
|
| 302 |
+
# Try IV from file at every 4-byte step in the first 22700 bytes
|
| 303 |
+
for iv_offset in range(0, min(22700, filesize - 16), 4):
|
| 304 |
+
iv = full_data[iv_offset:iv_offset + 16]
|
| 305 |
+
|
| 306 |
+
# Try decrypting header encrypted data (byte 8+)
|
| 307 |
+
ct = full_data[8:24] # Just decrypt first 16 bytes
|
| 308 |
+
try:
|
| 309 |
+
dec = bcrypt_aes_cfb_decrypt(ct, key, iv, mbl)
|
| 310 |
+
u32 = struct.unpack_from("<I", dec, 0)[0]
|
| 311 |
+
if u32 == 1:
|
| 312 |
+
print(f"\n ★★★ FOUND! magic_number=1 with iv_offset={iv_offset}, key={key_name}, CFB{'128' if mbl==16 else '8'}")
|
| 313 |
+
print(f" IV: {iv.hex()}")
|
| 314 |
+
print(f" Decrypted first 16 bytes: {dec[:16].hex()}")
|
| 315 |
+
# Decrypt more data
|
| 316 |
+
dec_full = bcrypt_aes_cfb_decrypt(full_data[8:8+4096], key, iv, mbl)
|
| 317 |
+
check_decrypted(dec_full, f"FULL header with iv_offset={iv_offset}")
|
| 318 |
+
found = True
|
| 319 |
+
except:
|
| 320 |
+
pass
|
| 321 |
+
|
| 322 |
+
# Try decrypting payload data
|
| 323 |
+
ct2 = full_data[payload_start:payload_start+16]
|
| 324 |
+
try:
|
| 325 |
+
dec2 = bcrypt_aes_cfb_decrypt(ct2, key, iv, mbl)
|
| 326 |
+
u32_2 = struct.unpack_from("<I", dec2, 0)[0]
|
| 327 |
+
if u32_2 == 1:
|
| 328 |
+
print(f"\n ★★★ FOUND! magic_number=1 with iv_offset={iv_offset}, key={key_name}, CFB{'128' if mbl==16 else '8'}")
|
| 329 |
+
print(f" IV: {iv.hex()}")
|
| 330 |
+
print(f" Decrypted first 16 bytes: {dec2[:16].hex()}")
|
| 331 |
+
# Decrypt more data
|
| 332 |
+
dec_full2 = bcrypt_aes_cfb_decrypt(full_data[payload_start:payload_start+4096], key, iv, mbl)
|
| 333 |
+
check_decrypted(dec_full2, f"FULL payload with iv_offset={iv_offset}")
|
| 334 |
+
found = True
|
| 335 |
+
except:
|
| 336 |
+
pass
|
| 337 |
+
|
| 338 |
+
if found:
|
| 339 |
+
break
|
| 340 |
+
if found:
|
| 341 |
+
break
|
| 342 |
+
|
| 343 |
+
if not found:
|
| 344 |
+
print(" No IV found in file that produces magic_number=1")
|
| 345 |
+
|
| 346 |
+
# ═══════════════════════════════════════════════════════════════
|
| 347 |
+
# Test 3: Try derived IVs not from file
|
| 348 |
+
# ═══════════════════════════════════════════════════════════════
|
| 349 |
+
|
| 350 |
+
print("\n" + "=" * 80)
|
| 351 |
+
print("TEST 3: Derived IV strategies via BCrypt")
|
| 352 |
+
print("=" * 80)
|
| 353 |
+
|
| 354 |
+
derived_ivs = {
|
| 355 |
+
"zeros": b"\x00" * 16,
|
| 356 |
+
"SHA256(key)[:16]": KEY_SHA256[:16],
|
| 357 |
+
"SHA256(key)[16:]": KEY_SHA256[16:],
|
| 358 |
+
"key[:16]": KEY_RAW[:16],
|
| 359 |
+
"key[16:]": KEY_RAW[16:],
|
| 360 |
+
"SHA256('')[:16]": hashlib.sha256(b"").digest()[:16],
|
| 361 |
+
"SHA256('\\0')[:16]": hashlib.sha256(b"\x00").digest()[:16],
|
| 362 |
+
"MD5(key)": hashlib.md5(KEY_RAW).digest(),
|
| 363 |
+
"SHA256('oneocr')[:16]": hashlib.sha256(b"oneocr").digest()[:16],
|
| 364 |
+
"SHA256(key+\\0)[:16]": hashlib.sha256(KEY_RAW + b"\x00").digest()[:16],
|
| 365 |
+
"SHA256(key_reversed)[:16]": hashlib.sha256(KEY_RAW[::-1]).digest()[:16],
|
| 366 |
+
"key XOR 0x36 [:16]": bytes(b ^ 0x36 for b in KEY_RAW[:16]), # HMAC ipad
|
| 367 |
+
"key XOR 0x5c [:16]": bytes(b ^ 0x5c for b in KEY_RAW[:16]), # HMAC opad
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
for iv_name, iv in derived_ivs.items():
|
| 371 |
+
for key_name, key in key_candidates.items():
|
| 372 |
+
for mbl in [16, 1]:
|
| 373 |
+
for region_name, ct in [("header[8:]", full_data[8:8+4096]),
|
| 374 |
+
(f"payload", full_data[payload_start:payload_start+4096])]:
|
| 375 |
+
try:
|
| 376 |
+
dec = bcrypt_aes_cfb_decrypt(ct, key, iv, mbl)
|
| 377 |
+
label = f"CFB{'128' if mbl==16 else '8'} key={key_name} iv={iv_name} data={region_name}"
|
| 378 |
+
check_decrypted(dec, label)
|
| 379 |
+
except:
|
| 380 |
+
pass
|
| 381 |
+
|
| 382 |
+
# ═══════════════════════════════════════════════════════════════
|
| 383 |
+
# Test 4: What if entire file from byte 0 is encrypted?
|
| 384 |
+
# ═══════════════════════════════════════════════════════════════
|
| 385 |
+
|
| 386 |
+
print("\n" + "=" * 80)
|
| 387 |
+
print("TEST 4: Entire file encrypted from byte 0")
|
| 388 |
+
print("=" * 80)
|
| 389 |
+
|
| 390 |
+
for key_name, key in key_candidates.items():
|
| 391 |
+
for mbl in [16, 1]:
|
| 392 |
+
for iv_name, iv in [("zeros", iv_zero), ("SHA256(key)[:16]", KEY_SHA256[:16]),
|
| 393 |
+
("key[:16]", KEY_RAW[:16])]:
|
| 394 |
+
try:
|
| 395 |
+
dec = bcrypt_aes_cfb_decrypt(full_data[:4096], key, iv, mbl)
|
| 396 |
+
label = f"CFB{'128' if mbl==16 else '8'} key={key_name} iv={iv_name} data=file[0:]"
|
| 397 |
+
check_decrypted(dec, label)
|
| 398 |
+
except:
|
| 399 |
+
pass
|
| 400 |
+
|
| 401 |
+
# ═══════════════════════════════════════════════════════════════
|
| 402 |
+
# Test 5: Decrypt with IV prepended to ciphertext in file
|
| 403 |
+
# ═══════════════════════════════════════════════════════════════
|
| 404 |
+
|
| 405 |
+
print("\n" + "=" * 80)
|
| 406 |
+
print("TEST 5: IV prepended to ciphertext at various offsets")
|
| 407 |
+
print("=" * 80)
|
| 408 |
+
|
| 409 |
+
for data_start in [0, 4, 8, 16, 24, header_offset, payload_start]:
|
| 410 |
+
iv_test = full_data[data_start:data_start+16]
|
| 411 |
+
ct_test = full_data[data_start+16:data_start+16+4096]
|
| 412 |
+
for key_name, key in key_candidates.items():
|
| 413 |
+
for mbl in [16, 1]:
|
| 414 |
+
try:
|
| 415 |
+
dec = bcrypt_aes_cfb_decrypt(ct_test, key, iv_test, mbl)
|
| 416 |
+
label = f"CFB{'128' if mbl==16 else '8'} key={key_name} IV=file[{data_start}:{data_start+16}] CT=file[{data_start+16}:]"
|
| 417 |
+
check_decrypted(dec, label)
|
| 418 |
+
except:
|
| 419 |
+
pass
|
| 420 |
+
|
| 421 |
+
print("\n" + "=" * 80)
|
| 422 |
+
print("DONE")
|
| 423 |
+
print("=" * 80)
|
_archive/attempts/create_test_image.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Create test image with text 'ONE OCR DZIAŁA!' for OCR testing."""
|
| 2 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 3 |
+
|
| 4 |
+
# Create white image
|
| 5 |
+
img = Image.new("RGB", (600, 150), color="white")
|
| 6 |
+
draw = ImageDraw.Draw(img)
|
| 7 |
+
|
| 8 |
+
# Try to use a good font, fallback to default
|
| 9 |
+
try:
|
| 10 |
+
font = ImageFont.truetype("arial.ttf", 48)
|
| 11 |
+
except OSError:
|
| 12 |
+
try:
|
| 13 |
+
font = ImageFont.truetype("C:/Windows/Fonts/arial.ttf", 48)
|
| 14 |
+
except OSError:
|
| 15 |
+
font = ImageFont.load_default()
|
| 16 |
+
|
| 17 |
+
# Draw black text
|
| 18 |
+
draw.text((30, 40), "ONE OCR DZIALA!", fill="black", font=font)
|
| 19 |
+
|
| 20 |
+
img.save("image.png")
|
| 21 |
+
print("Created image.png with text 'ONE OCR DZIALA!'")
|
_archive/attempts/decrypt_model.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OneOCR Model Extraction via Runtime Memory Dump.
|
| 3 |
+
|
| 4 |
+
Strategy: Load the OCR pipeline (which decrypts the model internally),
|
| 5 |
+
then scan our own process memory for ONNX/protobuf patterns and dump them.
|
| 6 |
+
|
| 7 |
+
Since oneocr.dll decrypts and decompresses models into memory during
|
| 8 |
+
CreateOcrPipeline, we can capture them by scanning process memory.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import ctypes
|
| 12 |
+
import ctypes.wintypes as wintypes
|
| 13 |
+
import struct
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import time
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from collections import Counter
|
| 19 |
+
import math
|
| 20 |
+
|
| 21 |
+
# ═══════════════════════════════════════════════════════════════
|
| 22 |
+
# Constants
|
| 23 |
+
# ═══════════════════════════════════════════════════════════════
|
| 24 |
+
|
| 25 |
+
OCR_DATA_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data")
|
| 26 |
+
DLL_PATH = str(OCR_DATA_DIR / "oneocr.dll")
|
| 27 |
+
ORT_DLL_PATH = str(OCR_DATA_DIR / "onnxruntime.dll")
|
| 28 |
+
MODEL_PATH = str(OCR_DATA_DIR / "oneocr.onemodel")
|
| 29 |
+
KEY = 'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 30 |
+
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\extracted_models")
|
| 31 |
+
|
| 32 |
+
# ═══════════════════════════════════════════════════════════════
|
| 33 |
+
# Windows API
|
| 34 |
+
# ═══════════════════════════════════════════════════════════════
|
| 35 |
+
|
| 36 |
+
kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
|
| 37 |
+
|
| 38 |
+
class MEMORY_BASIC_INFORMATION(ctypes.Structure):
    """ctypes mirror of the Win32 MEMORY_BASIC_INFORMATION structure.

    Filled in by kernel32.VirtualQuery while walking the address space;
    field order and types must match the Windows ABI exactly.
    """
    _fields_ = [
        ("BaseAddress", ctypes.c_void_p),       # start address of the region
        ("AllocationBase", ctypes.c_void_p),    # base of the original allocation
        ("AllocationProtect", wintypes.DWORD),  # protection at allocation time
        ("RegionSize", ctypes.c_size_t),        # region size in bytes
        ("State", wintypes.DWORD),              # compared against MEM_COMMIT below
        ("Protect", wintypes.DWORD),            # current protection (PAGE_* flags)
        ("Type", wintypes.DWORD),               # region type
    ]
|
| 48 |
+
|
| 49 |
+
MEM_COMMIT = 0x1000
|
| 50 |
+
PAGE_NOACCESS = 0x01
|
| 51 |
+
PAGE_GUARD = 0x100
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def entropy(data: bytes) -> float:
    """Shannon entropy of *data* in bits per byte (0.0 for empty input)."""
    if not data:
        return 0.0
    total = len(data)
    bits = 0.0
    for count in Counter(data).values():
        p = count / total
        bits -= p * math.log2(p)
    return bits
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def scan_memory_regions():
    """Enumerate all committed, readable memory regions of the current process.

    Walks the address space with kernel32.VirtualQuery and returns a list of
    ``(base_address, region_size)`` tuples for regions that are MEM_COMMIT
    and neither PAGE_NOACCESS nor guard pages.

    Fix vs. original: removed an unused ``handle = kernel32.GetCurrentProcess()``
    local — VirtualQuery always operates on the calling process.
    """
    regions = []
    mbi = MEMORY_BASIC_INFORMATION()
    address = 0
    # 48-bit canonical user-mode address ceiling on x64 Windows
    max_addr = (1 << 47) - 1

    while address < max_addr:
        result = kernel32.VirtualQuery(
            ctypes.c_void_p(address),
            ctypes.byref(mbi),
            ctypes.sizeof(mbi)
        )
        if result == 0:
            break

        base_addr = mbi.BaseAddress or 0
        region_size = mbi.RegionSize or 0

        if region_size == 0:
            break

        # Keep only committed, accessible pages (skip no-access and guard pages)
        if (mbi.State == MEM_COMMIT and
                mbi.Protect not in (0, PAGE_NOACCESS, PAGE_GUARD) and
                not (mbi.Protect & PAGE_GUARD)):
            regions.append((base_addr, region_size))

        new_address = base_addr + region_size
        # Defend against a query that fails to advance (would loop forever)
        if new_address <= address:
            break
        address = new_address
    return regions
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def read_mem(address, size):
    """Read *size* bytes at *address* in our own process.

    First attempts a direct ``ctypes.string_at`` read (fastest, since this is
    the current address space); on any failure falls back to
    ``ReadProcessMemory``. Returns the bytes read, or None if both fail.
    """
    try:
        return ctypes.string_at(address, size)
    except Exception:
        pass
    # Fallback to ReadProcessMemory
    try:
        buffer = (ctypes.c_ubyte * size)()
        bytes_read = ctypes.c_size_t(0)
        success = kernel32.ReadProcessMemory(
            kernel32.GetCurrentProcess(),
            ctypes.c_void_p(address),
            buffer,
            size,
            ctypes.byref(bytes_read),
        )
        if success and bytes_read.value > 0:
            return bytes(buffer[:bytes_read.value])
    except Exception:
        pass
    return None
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ═══════════════════════════════════════════════════════════════
|
| 118 |
+
# Step 1: Snapshot BEFORE loading OCR
|
| 119 |
+
# ═══════════════════════════════════════════════════════════════
|
| 120 |
+
|
| 121 |
+
print("=" * 80)
|
| 122 |
+
print("OneOCR Model Extraction via Runtime Memory Dump")
|
| 123 |
+
print("=" * 80)
|
| 124 |
+
|
| 125 |
+
print("\n[1/5] Memory snapshot BEFORE OCR load...")
|
| 126 |
+
before = set()
|
| 127 |
+
before_data = {}
|
| 128 |
+
for base, size in scan_memory_regions():
|
| 129 |
+
before.add(base)
|
| 130 |
+
# Store hash of small regions for change detection
|
| 131 |
+
if size <= 65536:
|
| 132 |
+
d = read_mem(base, size)
|
| 133 |
+
if d:
|
| 134 |
+
before_data[base] = hash(d)
|
| 135 |
+
print(f" {len(before)} regions before")
|
| 136 |
+
|
| 137 |
+
# ═════════════════════════════════════════════════��═════════════
|
| 138 |
+
# Step 2: Load DLLs
|
| 139 |
+
# ═══════════════════════════════════════════════════════════════
|
| 140 |
+
|
| 141 |
+
print("\n[2/5] Loading DLLs...")
|
| 142 |
+
os.add_dll_directory(str(OCR_DATA_DIR))
|
| 143 |
+
os.environ["PATH"] = str(OCR_DATA_DIR) + ";" + os.environ.get("PATH", "")
|
| 144 |
+
|
| 145 |
+
ort_dll = ctypes.WinDLL(ORT_DLL_PATH)
|
| 146 |
+
print(f" OK: onnxruntime.dll")
|
| 147 |
+
|
| 148 |
+
ocr_dll = ctypes.WinDLL(DLL_PATH)
|
| 149 |
+
print(f" OK: oneocr.dll")
|
| 150 |
+
|
| 151 |
+
# ═══════════════════════════════════════════════════════════════
|
| 152 |
+
# Step 3: Init OCR pipeline (triggers decryption)
|
| 153 |
+
# ═══════════════════════════════════════════════════════════════
|
| 154 |
+
|
| 155 |
+
print("\n[3/5] Creating OCR pipeline (decrypts model)...")
|
| 156 |
+
|
| 157 |
+
CreateOcrInitOptions = ocr_dll.CreateOcrInitOptions
|
| 158 |
+
CreateOcrInitOptions.restype = ctypes.c_int64
|
| 159 |
+
CreateOcrInitOptions.argtypes = [ctypes.POINTER(ctypes.c_int64)]
|
| 160 |
+
|
| 161 |
+
OcrInitOptionsSetUseModelDelayLoad = ocr_dll.OcrInitOptionsSetUseModelDelayLoad
|
| 162 |
+
OcrInitOptionsSetUseModelDelayLoad.restype = ctypes.c_int64
|
| 163 |
+
OcrInitOptionsSetUseModelDelayLoad.argtypes = [ctypes.c_int64, ctypes.c_char]
|
| 164 |
+
|
| 165 |
+
CreateOcrPipeline = ocr_dll.CreateOcrPipeline
|
| 166 |
+
CreateOcrPipeline.restype = ctypes.c_int64
|
| 167 |
+
CreateOcrPipeline.argtypes = [ctypes.c_int64, ctypes.c_int64, ctypes.c_int64, ctypes.POINTER(ctypes.c_int64)]
|
| 168 |
+
|
| 169 |
+
ctx = ctypes.c_int64(0)
|
| 170 |
+
res = CreateOcrInitOptions(ctypes.byref(ctx))
|
| 171 |
+
assert res == 0, f"CreateOcrInitOptions failed: {res}"
|
| 172 |
+
|
| 173 |
+
# Disable delay load → load ALL models immediately
|
| 174 |
+
res = OcrInitOptionsSetUseModelDelayLoad(ctx, ctypes.c_char(0))
|
| 175 |
+
assert res == 0, f"SetUseModelDelayLoad failed: {res}"
|
| 176 |
+
|
| 177 |
+
model_path_c = ctypes.c_char_p(MODEL_PATH.encode("utf-8"))
|
| 178 |
+
key_c = ctypes.c_char_p(KEY.encode("utf-8"))
|
| 179 |
+
|
| 180 |
+
pipeline = ctypes.c_int64(0)
|
| 181 |
+
res = CreateOcrPipeline(
|
| 182 |
+
ctypes.cast(model_path_c, ctypes.c_void_p).value,
|
| 183 |
+
ctypes.cast(key_c, ctypes.c_void_p).value,
|
| 184 |
+
ctx.value,
|
| 185 |
+
ctypes.byref(pipeline)
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
if res != 0:
|
| 189 |
+
print(f" ERROR: CreateOcrPipeline returned {res}")
|
| 190 |
+
sys.exit(1)
|
| 191 |
+
|
| 192 |
+
print(f" Pipeline OK! handle=0x{pipeline.value:x}")
|
| 193 |
+
time.sleep(0.5)
|
| 194 |
+
|
| 195 |
+
# ═══════════════════════════════════════════════════════════════
|
| 196 |
+
# Step 4: Find new/changed memory regions & search for ONNX
|
| 197 |
+
# ═══════════════════════════════════════════════════════════════
|
| 198 |
+
|
| 199 |
+
print("\n[4/5] Scanning process memory for ONNX models...")
|
| 200 |
+
|
| 201 |
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
| 202 |
+
|
| 203 |
+
after_regions = scan_memory_regions()
|
| 204 |
+
new_regions = [(b, s) for b, s in after_regions if b not in before]
|
| 205 |
+
print(f" Total regions after: {len(after_regions)}")
|
| 206 |
+
print(f" New regions: {len(new_regions)}")
|
| 207 |
+
|
| 208 |
+
# Size distribution of new regions
|
| 209 |
+
new_large = [(b, s) for b, s in new_regions if s >= 1024*1024]
|
| 210 |
+
new_total = sum(s for _, s in new_regions)
|
| 211 |
+
print(f" New large regions (>1MB): {len(new_large)}")
|
| 212 |
+
print(f" Total new memory: {new_total/1024/1024:.1f} MB")
|
| 213 |
+
|
| 214 |
+
found = []
|
| 215 |
+
|
| 216 |
+
# ONNX protobuf field patterns for start of file
|
| 217 |
+
# ir_version (field 1, varint) followed by opset_import (field 2, len-delimited)
|
| 218 |
+
# or producer_name (field 2, len-delimited) etc.
|
| 219 |
+
|
| 220 |
+
# Search patterns
|
| 221 |
+
PATTERNS = [
|
| 222 |
+
b"\x08\x07\x12", # ir_v=7, then field 2
|
| 223 |
+
b"\x08\x08\x12", # ir_v=8
|
| 224 |
+
b"\x08\x06\x12", # ir_v=6
|
| 225 |
+
b"\x08\x05\x12", # ir_v=5
|
| 226 |
+
b"\x08\x04\x12", # ir_v=4
|
| 227 |
+
b"\x08\x03\x12", # ir_v=3
|
| 228 |
+
b"\x08\x09\x12", # ir_v=9
|
| 229 |
+
b"ORTM", # ORT model format
|
| 230 |
+
b"ONNX", # Just in case
|
| 231 |
+
b"\x08\x07\x1a", # ir_v=7, field 3
|
| 232 |
+
b"\x08\x08\x1a", # ir_v=8, field 3
|
| 233 |
+
]
|
| 234 |
+
|
| 235 |
+
# Scan ALL new large regions
|
| 236 |
+
for ridx, (base, size) in enumerate(sorted(new_regions, key=lambda x: x[1], reverse=True)):
|
| 237 |
+
if size < 4096:
|
| 238 |
+
continue
|
| 239 |
+
|
| 240 |
+
read_size = min(size, 200 * 1024 * 1024)
|
| 241 |
+
data = read_mem(base, read_size)
|
| 242 |
+
if not data:
|
| 243 |
+
continue
|
| 244 |
+
|
| 245 |
+
# Check entropy of first 4KB
|
| 246 |
+
ent = entropy(data[:4096])
|
| 247 |
+
uniq = len(set(data[:4096]))
|
| 248 |
+
|
| 249 |
+
if size >= 100000:
|
| 250 |
+
# Log large regions regardless
|
| 251 |
+
print(f" Region 0x{base:x} size={size:,} ent={ent:.2f} uniq={uniq}/256 first={data[:16].hex()}")
|
| 252 |
+
|
| 253 |
+
# Search for patterns
|
| 254 |
+
for pattern in PATTERNS:
|
| 255 |
+
offset = 0
|
| 256 |
+
while True:
|
| 257 |
+
idx = data.find(pattern, offset)
|
| 258 |
+
if idx < 0:
|
| 259 |
+
break
|
| 260 |
+
|
| 261 |
+
# Validate: check surrounding context
|
| 262 |
+
chunk = data[idx:idx+min(4096, len(data)-idx)]
|
| 263 |
+
chunk_ent = entropy(chunk[:1024]) if len(chunk) >= 1024 else entropy(chunk)
|
| 264 |
+
|
| 265 |
+
# Valid models should have moderate entropy (not encrypted high-entropy)
|
| 266 |
+
if chunk_ent < 7.5 and len(chunk) > 64:
|
| 267 |
+
addr = base + idx
|
| 268 |
+
remaining = len(data) - idx
|
| 269 |
+
found.append({
|
| 270 |
+
"addr": addr,
|
| 271 |
+
"base": base,
|
| 272 |
+
"offset": idx,
|
| 273 |
+
"size": remaining,
|
| 274 |
+
"pattern": pattern.hex(),
|
| 275 |
+
"ent": chunk_ent,
|
| 276 |
+
"first_32": data[idx:idx+32].hex(),
|
| 277 |
+
})
|
| 278 |
+
print(f" ★ ONNX candidate at 0x{addr:x}: pattern={pattern.hex()} "
|
| 279 |
+
f"ent={chunk_ent:.2f} remaining={remaining:,}")
|
| 280 |
+
print(f" First 32: {data[idx:idx+32].hex()}")
|
| 281 |
+
|
| 282 |
+
offset = idx + len(pattern)
|
| 283 |
+
|
| 284 |
+
print(f"\n Found {len(found)} ONNX candidates total")
|
| 285 |
+
|
| 286 |
+
# ═══════════════════════════════════════════════════════════════
|
| 287 |
+
# Step 5: Dump candidates
|
| 288 |
+
# ═══════════════════════════════════════════════════════════════
|
| 289 |
+
|
| 290 |
+
print("\n[5/5] Dumping models...")
|
| 291 |
+
|
| 292 |
+
if found:
|
| 293 |
+
# Deduplicate by address
|
| 294 |
+
seen = set()
|
| 295 |
+
for i, m in enumerate(found):
|
| 296 |
+
if m["addr"] in seen:
|
| 297 |
+
continue
|
| 298 |
+
seen.add(m["addr"])
|
| 299 |
+
|
| 300 |
+
dump_size = min(m["size"], 100 * 1024 * 1024)
|
| 301 |
+
data = read_mem(m["addr"], dump_size)
|
| 302 |
+
if data:
|
| 303 |
+
fname = f"onnx_{i}_0x{m['addr']:x}_{dump_size//1024}KB.bin"
|
| 304 |
+
out = OUTPUT_DIR / fname
|
| 305 |
+
with open(out, "wb") as f:
|
| 306 |
+
f.write(data)
|
| 307 |
+
print(f" Saved: {fname} ({len(data):,} bytes)")
|
| 308 |
+
else:
|
| 309 |
+
print(" No ONNX patterns found. Dumping ALL large new regions (>1MB)...")
|
| 310 |
+
|
| 311 |
+
for i, (base, size) in enumerate(new_large):
|
| 312 |
+
data = read_mem(base, min(size, 200*1024*1024))
|
| 313 |
+
if data:
|
| 314 |
+
ent = entropy(data[:4096])
|
| 315 |
+
fname = f"region_{i}_0x{base:x}_{size//1024//1024}MB_ent{ent:.1f}.bin"
|
| 316 |
+
out = OUTPUT_DIR / fname
|
| 317 |
+
with open(out, "wb") as f:
|
| 318 |
+
f.write(data)
|
| 319 |
+
print(f" Saved: {fname} ({len(data):,} bytes, ent={ent:.2f})")
|
| 320 |
+
|
| 321 |
+
# Summary
|
| 322 |
+
print("\n" + "=" * 80)
|
| 323 |
+
print("RESULTS")
|
| 324 |
+
print("=" * 80)
|
| 325 |
+
|
| 326 |
+
if OUTPUT_DIR.exists():
|
| 327 |
+
files = sorted(OUTPUT_DIR.iterdir())
|
| 328 |
+
if files:
|
| 329 |
+
total_size = sum(f.stat().st_size for f in files)
|
| 330 |
+
print(f"\nExtracted {len(files)} files ({total_size/1024/1024:.1f} MB):")
|
| 331 |
+
for f in files:
|
| 332 |
+
sz = f.stat().st_size
|
| 333 |
+
# Quick check if it's ONNX
|
| 334 |
+
with open(f, "rb") as fh:
|
| 335 |
+
header = fh.read(32)
|
| 336 |
+
print(f" {f.name}: {sz:,} bytes | first_16={header[:16].hex()}")
|
| 337 |
+
else:
|
| 338 |
+
print("\nNo files extracted.")
|
_archive/attempts/decrypt_with_static_iv.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Extract the static IV string from DLL and find how key derivation works.
|
| 3 |
+
|
| 4 |
+
Key findings from disassembly:
|
| 5 |
+
1. Static 30-byte string at RVA 0x02725C60 used as IV (truncated to 16)
|
| 6 |
+
2. SHA256(combined) used as AES key material
|
| 7 |
+
3. Combined = some_function(key_string, iv_from_data, flag)
|
| 8 |
+
4. Function at 0x18006c3d0 combines key + iv_prefix
|
| 9 |
+
|
| 10 |
+
Need to:
|
| 11 |
+
a) Read the static IV string
|
| 12 |
+
b) Disassemble function 0x18006c3d0 to understand combination
|
| 13 |
+
c) Try decryption
|
| 14 |
+
"""
|
| 15 |
+
import struct, hashlib
|
| 16 |
+
from capstone import Cs, CS_ARCH_X86, CS_MODE_64
|
| 17 |
+
|
| 18 |
+
DLL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
|
| 19 |
+
MODEL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"
|
| 20 |
+
|
| 21 |
+
with open(DLL_PATH, "rb") as f:
|
| 22 |
+
dll = f.read()
|
| 23 |
+
|
| 24 |
+
with open(MODEL_PATH, "rb") as f:
|
| 25 |
+
model = f.read()
|
| 26 |
+
|
| 27 |
+
# Parse PE sections for RVA → file offset mapping
|
| 28 |
+
e_lfanew = struct.unpack_from('<I', dll, 0x3c)[0]
|
| 29 |
+
num_sections = struct.unpack_from('<H', dll, e_lfanew + 6)[0]
|
| 30 |
+
opt_size = struct.unpack_from('<H', dll, e_lfanew + 20)[0]
|
| 31 |
+
sections_off = e_lfanew + 24 + opt_size
|
| 32 |
+
|
| 33 |
+
sections = []
|
| 34 |
+
for i in range(num_sections):
|
| 35 |
+
so = sections_off + i * 40
|
| 36 |
+
name = dll[so:so+8].rstrip(b'\x00').decode('ascii', errors='replace')
|
| 37 |
+
vsize = struct.unpack_from('<I', dll, so + 8)[0]
|
| 38 |
+
vrva = struct.unpack_from('<I', dll, so + 12)[0]
|
| 39 |
+
rawsize = struct.unpack_from('<I', dll, so + 16)[0]
|
| 40 |
+
rawoff = struct.unpack_from('<I', dll, so + 20)[0]
|
| 41 |
+
sections.append((name, vrva, vsize, rawoff, rawsize))
|
| 42 |
+
print(f"Section {name}: RVA=0x{vrva:08x} VSize=0x{vsize:08x} Raw=0x{rawoff:08x} RawSize=0x{rawsize:08x}")
|
| 43 |
+
|
| 44 |
+
def rva_to_foff(rva, section_table=None):
    """Map a relative virtual address (RVA) to a raw file offset.

    Walks the PE section table and returns the file offset backing *rva*,
    or ``None`` when the RVA does not fall inside any section's raw data.

    Args:
        rva: Relative virtual address inside the loaded image.
        section_table: Optional iterable of ``(name, vrva, vsize, rawoff,
            rawsize)`` tuples. Defaults to the module-level ``sections``
            list parsed from the DLL's PE headers.

    Returns:
        int file offset, or ``None`` if *rva* is not backed by raw data.
    """
    table = sections if section_table is None else section_table
    for _name, vrva, _vsize, rawoff, rawsize in table:
        # Bound by the *raw* size: virtual bytes past the raw data
        # (zero-filled padding) do not exist in the file on disk.
        if vrva <= rva < vrva + rawsize:
            return rawoff + (rva - vrva)
    return None
|
| 49 |
+
|
| 50 |
+
IMAGE_BASE = 0x180000000
|
| 51 |
+
TEXT_VA = 0x1000
|
| 52 |
+
TEXT_FILE_OFFSET = 0x400
|
| 53 |
+
|
| 54 |
+
def text_rva_to_file(rva):
    """Translate an RVA inside .text to its raw file offset.

    Relies on the fixed .text layout of this DLL: virtual address TEXT_VA
    is backed by raw file data starting at TEXT_FILE_OFFSET.
    """
    return TEXT_FILE_OFFSET + (rva - TEXT_VA)
|
| 56 |
+
|
| 57 |
+
# 1. Read the static 30-byte string at RVA 0x02725C60
|
| 58 |
+
# The LEA instruction was at RVA 0x0015baac:
|
| 59 |
+
# lea rdx, [rip + 0x25ca1ad]
|
| 60 |
+
# RIP = 0x0015baac + 7 = 0x0015bab3
|
| 61 |
+
# Target RVA = 0x0015bab3 + 0x25ca1ad = ?
|
| 62 |
+
target_rva = 0x0015bab3 + 0x25ca1ad
|
| 63 |
+
print(f"\nStatic IV string RVA: 0x{target_rva:08x}")
|
| 64 |
+
foff = rva_to_foff(target_rva)
|
| 65 |
+
print(f"File offset: 0x{foff:08x}" if foff else "NOT FOUND")
|
| 66 |
+
|
| 67 |
+
if foff:
|
| 68 |
+
static_iv_30 = dll[foff:foff+30]
|
| 69 |
+
print(f"Static IV (30 bytes): {static_iv_30.hex()}")
|
| 70 |
+
print(f"Static IV (30 chars): {static_iv_30}")
|
| 71 |
+
static_iv_16 = static_iv_30[:16]
|
| 72 |
+
print(f"Static IV truncated to 16: {static_iv_16.hex()}")
|
| 73 |
+
print(f"Static IV truncated (repr): {static_iv_16}")
|
| 74 |
+
|
| 75 |
+
# Also check nearby strings for context
|
| 76 |
+
if foff:
|
| 77 |
+
print(f"\nContext around static IV string (foff-16 to foff+48):")
|
| 78 |
+
for i in range(-16, 64):
|
| 79 |
+
c = dll[foff+i]
|
| 80 |
+
print(f" +{i:3d}: 0x{c:02x} ({chr(c) if 32 <= c < 127 else '.'})")
|
| 81 |
+
|
| 82 |
+
# 2. Read the first 16 bytes of encrypted data (the "prefix" extracted before key derivation)
|
| 83 |
+
header_offset = struct.unpack_from('<Q', model, 0)[0]
|
| 84 |
+
prefix_16 = model[8:24]
|
| 85 |
+
print(f"\nData prefix (first 16 bytes after offset): {prefix_16.hex()}")
|
| 86 |
+
print(f"Data prefix repr: {prefix_16}")
|
| 87 |
+
|
| 88 |
+
# 3. Disassemble the key combination function at 0x18006c3d0
|
| 89 |
+
# RVA = 0x0006c3d0
|
| 90 |
+
combo_rva = 0x0006c3d0
|
| 91 |
+
combo_foff = rva_to_foff(combo_rva)
|
| 92 |
+
print(f"\nKey combination function RVA: 0x{combo_rva:08x}, file: 0x{combo_foff:08x}" if combo_foff else "NOT FOUND")
|
| 93 |
+
|
| 94 |
+
md = Cs(CS_ARCH_X86, CS_MODE_64)
|
| 95 |
+
md.detail = False
|
| 96 |
+
|
| 97 |
+
if combo_foff:
|
| 98 |
+
code = dll[combo_foff:combo_foff + 0x200]
|
| 99 |
+
print(f"\n{'='*100}")
|
| 100 |
+
print(f"Key combination function at RVA 0x{combo_rva:08x}")
|
| 101 |
+
print(f"{'='*100}")
|
| 102 |
+
for insn in md.disasm(code, IMAGE_BASE + combo_rva):
|
| 103 |
+
foff2 = rva_to_foff(insn.address - IMAGE_BASE)
|
| 104 |
+
line = f" {insn.address - IMAGE_BASE:08x} ({foff2:08x}): {insn.bytes.hex():<40s} {insn.mnemonic:<10s} {insn.op_str}"
|
| 105 |
+
if insn.mnemonic == 'ret':
|
| 106 |
+
print(line)
|
| 107 |
+
break
|
| 108 |
+
print(line)
|
| 109 |
+
|
| 110 |
+
# 4. Now let's also check what string is compared when key is empty
|
| 111 |
+
# At 0x0015b88d: lea rdx, [rip + 0x25b59db]
|
| 112 |
+
# RIP = 0x0015b88d + 7 = 0x0015b894
|
| 113 |
+
# Target = 0x0015b894 + 0x25b59db = ?
|
| 114 |
+
empty_key_str_rva = 0x0015b894 + 0x25b59db
|
| 115 |
+
empty_foff = rva_to_foff(empty_key_str_rva)
|
| 116 |
+
if empty_foff:
|
| 117 |
+
s = dll[empty_foff:empty_foff+64].split(b'\x00')[0]
|
| 118 |
+
print(f"\nDefault key comparison string: {s}")
|
| 119 |
+
|
| 120 |
+
# 5. Try decryption with various key derivation methods
|
| 121 |
+
print("\n" + "="*100)
|
| 122 |
+
print("DECRYPTION ATTEMPTS WITH STATIC IV AND DERIVED KEY")
|
| 123 |
+
print("="*100)
|
| 124 |
+
|
| 125 |
+
KEY_RAW = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 126 |
+
|
| 127 |
+
# Encrypted header: model[8 : header_offset]
|
| 128 |
+
enc_header = model[8:header_offset]
|
| 129 |
+
# The prefix/IV: first 16 bytes
|
| 130 |
+
data_prefix = enc_header[:16]
|
| 131 |
+
# The ciphertext: remaining bytes
|
| 132 |
+
ciphertext = enc_header[16:]
|
| 133 |
+
|
| 134 |
+
MAGIC = 0x252b081a4a
|
| 135 |
+
|
| 136 |
+
from Crypto.Cipher import AES
|
| 137 |
+
|
| 138 |
+
def try_dec(aes_key, iv, ct, label):
    """Attempt AES-CFB128 decryption and test for the expected magic value.

    Decrypts only the first 256 bytes of *ct* and checks whether the
    little-endian u64 at plaintext offset 0x10 equals the module-level
    MAGIC constant. Prints diagnostics as a side effect.

    Returns True only on a magic match, False otherwise (including when
    the cipher construction raises, e.g. for a bad key length).
    """
    try:
        cipher = AES.new(aes_key, AES.MODE_CFB, iv=iv, segment_size=128)
        # 256 bytes is enough to validate the header magic at offset 0x10.
        pt = cipher.decrypt(ct[:256])
        if len(pt) >= 24:
            magic = struct.unpack_from('<Q', pt, 0x10)[0]
            if magic == MAGIC:
                print(f" *** SUCCESS *** {label}")
                print(f" First 64 bytes: {pt[:64].hex()}")
                return True
            else:
                # Low byte-diversity in the first 64 bytes suggests structured
                # (non-random) output, i.e. a near-miss worth logging.
                unique = len(set(pt[:64]))
                # Only print if somewhat promising
                # (0x4a is the low byte of MAGIC = 0x252b081a4a).
                if unique < 45 or magic & 0xFF == 0x4a:
                    print(f" {label}: magic=0x{magic:016x}, unique_64={unique}")
    except Exception as e:
        # Broad catch is deliberate: one bad key/IV candidate must not
        # abort the whole brute-force sweep.
        print(f" {label}: ERROR {e}")
    return False
|
| 157 |
+
|
| 158 |
+
if foff:
|
| 159 |
+
iv = static_iv_16
|
| 160 |
+
|
| 161 |
+
# Try 1: SHA256(key + data_prefix)
|
| 162 |
+
combined1 = KEY_RAW + data_prefix
|
| 163 |
+
aes_key1 = hashlib.sha256(combined1).digest()
|
| 164 |
+
try_dec(aes_key1, iv, ciphertext, "SHA256(key + data_prefix)")
|
| 165 |
+
|
| 166 |
+
# Try 2: SHA256(data_prefix + key)
|
| 167 |
+
combined2 = data_prefix + KEY_RAW
|
| 168 |
+
aes_key2 = hashlib.sha256(combined2).digest()
|
| 169 |
+
try_dec(aes_key2, iv, ciphertext, "SHA256(data_prefix + key)")
|
| 170 |
+
|
| 171 |
+
# Try 3: SHA256(key) with static IV
|
| 172 |
+
aes_key3 = hashlib.sha256(KEY_RAW).digest()
|
| 173 |
+
try_dec(aes_key3, iv, ciphertext, "SHA256(key) + static_iv")
|
| 174 |
+
|
| 175 |
+
# Try 4: Raw key with static IV
|
| 176 |
+
try_dec(KEY_RAW, iv, ciphertext, "raw_key + static_iv")
|
| 177 |
+
|
| 178 |
+
# Try 5: SHA256(key + data_prefix) on full enc_header (no prefix removal)
|
| 179 |
+
try_dec(aes_key1, iv, enc_header, "SHA256(key+prefix) + full_header")
|
| 180 |
+
try_dec(aes_key2, iv, enc_header, "SHA256(prefix+key) + full_header")
|
| 181 |
+
|
| 182 |
+
# Try 6: Maybe prefix is NOT stripped from ciphertext for BCrypt
|
| 183 |
+
try_dec(aes_key3, iv, enc_header, "SHA256(key) + static_iv + full_header")
|
| 184 |
+
try_dec(KEY_RAW, iv, enc_header, "raw_key + static_iv + full_header")
|
| 185 |
+
|
| 186 |
+
# Also try the full static_iv_30 string as both key and IV source
|
| 187 |
+
# Maybe the static string IS the key, and data_prefix IS the IV
|
| 188 |
+
try_dec(hashlib.sha256(static_iv_30).digest(), data_prefix, ciphertext, "SHA256(static30) + data_prefix_iv")
|
| 189 |
+
|
| 190 |
+
# What if key derivation involves the static string too?
|
| 191 |
+
# SHA256(key + static_string)
|
| 192 |
+
combined3 = KEY_RAW + static_iv_30
|
| 193 |
+
aes_key6 = hashlib.sha256(combined3).digest()
|
| 194 |
+
try_dec(aes_key6, data_prefix, ciphertext, "SHA256(key + static30) + prefix_iv")
|
| 195 |
+
try_dec(aes_key6, iv, ciphertext, "SHA256(key + static30) + static_iv")
|
| 196 |
+
|
| 197 |
+
# What if the function combines key with static string, and data_prefix is IV?
|
| 198 |
+
# Try many concatenation variants
|
| 199 |
+
variants = [
|
| 200 |
+
(KEY_RAW + data_prefix, iv, ciphertext, "key||prefix"),
|
| 201 |
+
(data_prefix + KEY_RAW, iv, ciphertext, "prefix||key"),
|
| 202 |
+
(KEY_RAW + static_iv_16, iv, ciphertext, "key||static16"),
|
| 203 |
+
(KEY_RAW + static_iv_30, iv, ciphertext, "key||static30"),
|
| 204 |
+
(static_iv_16 + KEY_RAW, iv, ciphertext, "static16||key"),
|
| 205 |
+
(static_iv_30 + KEY_RAW, iv, ciphertext, "static30||key"),
|
| 206 |
+
(KEY_RAW + data_prefix, data_prefix, ciphertext, "key||prefix, iv=prefix"),
|
| 207 |
+
(data_prefix + KEY_RAW, data_prefix, ciphertext, "prefix||key, iv=prefix"),
|
| 208 |
+
]
|
| 209 |
+
|
| 210 |
+
for combo, iv_used, ct, desc in variants:
|
| 211 |
+
aes_key = hashlib.sha256(combo).digest()
|
| 212 |
+
try_dec(aes_key, iv_used, ct, f"SHA256({desc})")
|
| 213 |
+
|
| 214 |
+
# Maybe the function at 0x06c3d0 does something more complex
|
| 215 |
+
# Let's also try: the "combined" is just the key (no IV involvement),
|
| 216 |
+
# and the function just copies/formats the key
|
| 217 |
+
# With different IV sources
|
| 218 |
+
|
| 219 |
+
# Try with BCrypt API directly
|
| 220 |
+
print("\n--- BCrypt API tests with static IV ---")
|
| 221 |
+
import ctypes
|
| 222 |
+
bcrypt = ctypes.windll.bcrypt
|
| 223 |
+
|
| 224 |
+
def bcrypt_dec(key_bytes, iv_bytes, ct_bytes, label):
    """Decrypt *ct_bytes* with Windows CNG (bcrypt.dll) AES in CFB mode.

    Mirrors the call sequence observed in oneocr.dll:
    open the AES provider -> set ChainingModeCFB and MessageBlockLength=16
    -> generate a symmetric key from raw bytes -> BCryptDecrypt twice
    (size query with a NULL output buffer, then the real call).
    Prints a success banner when the plaintext carries the expected MAGIC
    value at offset 0x10.

    Returns the plaintext bytes on a completed decrypt (whether or not the
    magic matched), or None on any BCrypt API failure.
    """
    hAlg = ctypes.c_void_p()
    # ctypes marshals the Python str "AES" as LPCWSTR automatically.
    status = bcrypt.BCryptOpenAlgorithmProvider(ctypes.byref(hAlg), "AES", None, 0)
    if status != 0:
        print(f" {label}: OpenAlg failed {status}")
        return None

    # CNG string properties are UTF-16LE with an explicit NUL terminator.
    mode = "ChainingModeCFB".encode('utf-16-le') + b'\x00\x00'
    bcrypt.BCryptSetProperty(hAlg, "ChainingMode", mode, len(mode), 0)

    # CFB128: feedback segment equal to the AES block size (16 bytes).
    block_len = ctypes.c_ulong(16)
    bcrypt.BCryptSetProperty(hAlg, "MessageBlockLength",
                             ctypes.byref(block_len), 4, 0)

    hKey = ctypes.c_void_p()
    # FIX: use unsigned c_ubyte buffers throughout. The original c_byte
    # (signed) arrays yield negative ints when sliced, which made
    # bytes(pt_buf[:result.value]) raise ValueError for any byte > 0x7F.
    kb = (ctypes.c_ubyte * len(key_bytes))(*key_bytes)
    status = bcrypt.BCryptGenerateSymmetricKey(
        hAlg, ctypes.byref(hKey), None, 0, kb, len(key_bytes), 0)
    if status != 0:
        bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
        print(f" {label}: GenKey failed {status}")
        return None

    ct_buf = (ctypes.c_ubyte * len(ct_bytes))(*ct_bytes)
    iv_buf = (ctypes.c_ubyte * len(iv_bytes))(*iv_bytes)

    # First call with a NULL output buffer: query the required output size.
    out_size = ctypes.c_ulong(0)
    status = bcrypt.BCryptDecrypt(
        hKey, ct_buf, len(ct_bytes), None,
        iv_buf, len(iv_bytes), None, 0,
        ctypes.byref(out_size), 0)

    if status != 0:
        bcrypt.BCryptDestroyKey(hKey)
        bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
        print(f" {label}: Decrypt size query failed {status:#x}")
        return None

    pt_buf = (ctypes.c_ubyte * out_size.value)()
    # BCryptDecrypt updates the IV buffer in place, so hand the real call
    # a fresh copy rather than the one consumed by the size query.
    iv_buf2 = (ctypes.c_ubyte * len(iv_bytes))(*iv_bytes)
    result = ctypes.c_ulong(0)
    status = bcrypt.BCryptDecrypt(
        hKey, ct_buf, len(ct_bytes), None,
        iv_buf2, len(iv_bytes), pt_buf, out_size.value,
        ctypes.byref(result), 0)

    # Release CNG handles regardless of the decrypt outcome.
    bcrypt.BCryptDestroyKey(hKey)
    bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)

    if status != 0:
        print(f" {label}: Decrypt failed {status:#x}")
        return None

    pt = bytes(pt_buf[:result.value])
    if len(pt) >= 24:
        # Same success criterion as try_dec: u64 magic at offset 0x10.
        magic = struct.unpack_from('<Q', pt, 0x10)[0]
        if magic == MAGIC:
            print(f" *** BCrypt SUCCESS *** {label}")
            print(f" First 64: {pt[:64].hex()}")
            return pt
    return pt
|
| 285 |
+
|
| 286 |
+
# BCrypt tests with various key derivations
|
| 287 |
+
for combo_data, desc in [
|
| 288 |
+
(KEY_RAW, "raw_key"),
|
| 289 |
+
(hashlib.sha256(KEY_RAW).digest(), "SHA256(key)"),
|
| 290 |
+
(hashlib.sha256(KEY_RAW + data_prefix).digest(), "SHA256(key+prefix)"),
|
| 291 |
+
(hashlib.sha256(data_prefix + KEY_RAW).digest(), "SHA256(prefix+key)"),
|
| 292 |
+
]:
|
| 293 |
+
for iv_data, iv_desc in [(iv, "static16"), (data_prefix, "data_prefix")]:
|
| 294 |
+
for ct_data, ct_desc in [(ciphertext, "ct_no_prefix"), (enc_header, "full_header")]:
|
| 295 |
+
result = bcrypt_dec(combo_data, iv_data, ct_data[:512],
|
| 296 |
+
f"key={desc}, iv={iv_desc}, ct={ct_desc}")
|
| 297 |
+
if result:
|
| 298 |
+
magic = struct.unpack_from('<Q', result, 0x10)[0] if len(result) >= 24 else 0
|
| 299 |
+
if magic == MAGIC:
|
| 300 |
+
print("FOUND THE CORRECT PARAMETERS!")
|
| 301 |
+
|
| 302 |
+
print("\nDone.")
|
_archive/attempts/disasm_bcrypt_calls.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Disassemble the actual BCrypt crypto operations at 0x18015ba45+
|
| 3 |
+
and map all indirect calls to IAT entries.
|
| 4 |
+
"""
|
| 5 |
+
import struct
|
| 6 |
+
from capstone import Cs, CS_ARCH_X86, CS_MODE_64
|
| 7 |
+
|
| 8 |
+
DLL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
|
| 9 |
+
IMAGE_BASE = 0x180000000
|
| 10 |
+
TEXT_VA = 0x1000
|
| 11 |
+
TEXT_FILE_OFFSET = 0x400
|
| 12 |
+
|
| 13 |
+
def rva_to_file(rva):
    """Convert a .text RVA to its raw file offset (fixed DLL layout)."""
    return (rva - TEXT_VA) + TEXT_FILE_OFFSET
|
| 15 |
+
|
| 16 |
+
def file_to_rva(foff):
    """Convert a raw file offset inside .text back to an RVA."""
    return (foff - TEXT_FILE_OFFSET) + TEXT_VA
|
| 18 |
+
|
| 19 |
+
with open(DLL_PATH, "rb") as f:
|
| 20 |
+
dll_data = f.read()
|
| 21 |
+
|
| 22 |
+
md = Cs(CS_ARCH_X86, CS_MODE_64)
|
| 23 |
+
md.detail = False
|
| 24 |
+
|
| 25 |
+
def disasm_region(name, file_start, file_end):
    """Disassemble a raw-file range of the DLL and print each instruction.

    Uses the module-level Capstone handle ``md`` and DLL image bytes
    ``dll_data``. Indirect calls through the IAT (opcode FF 15, i.e.
    ``call [rip+disp32]``) are annotated with the RVA of the IAT slot
    they dispatch through, so they can be matched against import names.
    """
    rva_start = file_to_rva(file_start)
    va_start = IMAGE_BASE + rva_start
    code = dll_data[file_start:file_end]
    print(f"\n{'='*100}")
    print(f"{name}")
    print(f"File: 0x{file_start:08x}-0x{file_end:08x}, RVA: 0x{rva_start:08x}")
    print(f"{'='*100}")
    for insn in md.disasm(code, va_start):
        foff = rva_to_file(insn.address - IMAGE_BASE)
        line = f" {insn.address - IMAGE_BASE:08x} ({foff:08x}): {insn.bytes.hex():<40s} {insn.mnemonic:<14s} {insn.op_str}"
        # Annotate indirect calls
        if insn.mnemonic == 'call' and insn.bytes[0] == 0xFF and insn.bytes[1] == 0x15:
            # RIP-relative target = RVA of the *next* instruction + signed disp32.
            disp = struct.unpack_from('<i', bytes(insn.bytes), 2)[0]
            target_rva = (insn.address - IMAGE_BASE) + insn.size + disp
            line += f" ; IAT@0x{target_rva:08x}"
        print(line)
|
| 42 |
+
|
| 43 |
+
# First, let's identify ALL BCrypt IAT entries
|
| 44 |
+
# From previous analysis:
|
| 45 |
+
# BCryptOpenAlgorithmProvider → IAT 0x0081a5e0
|
| 46 |
+
# BCryptGetProperty → IAT 0x0081a5d0
|
| 47 |
+
# BCryptSetProperty → IAT 0x0081a608
|
| 48 |
+
# Let's find the rest by looking at the import section
|
| 49 |
+
|
| 50 |
+
# Parse PE to find BCrypt imports
|
| 51 |
+
print("="*100)
|
| 52 |
+
print("FINDING ALL BCRYPT IAT ENTRIES")
|
| 53 |
+
print("="*100)
|
| 54 |
+
|
| 55 |
+
# Parse PE headers
|
| 56 |
+
e_lfanew = struct.unpack_from('<I', dll_data, 0x3c)[0]
|
| 57 |
+
opt_hdr_off = e_lfanew + 24
|
| 58 |
+
import_dir_rva = struct.unpack_from('<I', dll_data, opt_hdr_off + 120)[0] # Import RVA
|
| 59 |
+
import_dir_size = struct.unpack_from('<I', dll_data, opt_hdr_off + 124)[0]
|
| 60 |
+
|
| 61 |
+
# Find sections for RVA to file offset mapping
|
| 62 |
+
num_sections = struct.unpack_from('<H', dll_data, e_lfanew + 6)[0]
|
| 63 |
+
sections_off = e_lfanew + 24 + struct.unpack_from('<H', dll_data, e_lfanew + 20)[0]
|
| 64 |
+
|
| 65 |
+
sections = []
|
| 66 |
+
for i in range(num_sections):
|
| 67 |
+
sec_off = sections_off + i * 40
|
| 68 |
+
name = dll_data[sec_off:sec_off+8].rstrip(b'\x00').decode('ascii', errors='replace')
|
| 69 |
+
vsize = struct.unpack_from('<I', dll_data, sec_off + 8)[0]
|
| 70 |
+
vrva = struct.unpack_from('<I', dll_data, sec_off + 12)[0]
|
| 71 |
+
rawsize = struct.unpack_from('<I', dll_data, sec_off + 16)[0]
|
| 72 |
+
rawoff = struct.unpack_from('<I', dll_data, sec_off + 20)[0]
|
| 73 |
+
sections.append((name, vrva, vsize, rawoff, rawsize))
|
| 74 |
+
|
| 75 |
+
def rva_to_foff(rva):
    """Map an RVA to a raw file offset using the parsed section table.

    Bounds-checks against the *virtual* size of each section; returns
    None when no section contains the RVA.
    """
    for _name, vrva, vsize, rawoff, _rawsize in sections:
        offset_in_section = rva - vrva
        if 0 <= offset_in_section < vsize:
            return rawoff + offset_in_section
    return None
|
| 80 |
+
|
| 81 |
+
if import_dir_rva:
|
| 82 |
+
ioff = rva_to_foff(import_dir_rva)
|
| 83 |
+
if ioff:
|
| 84 |
+
idx = 0
|
| 85 |
+
while True:
|
| 86 |
+
desc_off = ioff + idx * 20
|
| 87 |
+
ilt_rva = struct.unpack_from('<I', dll_data, desc_off)[0]
|
| 88 |
+
name_rva = struct.unpack_from('<I', dll_data, desc_off + 12)[0]
|
| 89 |
+
iat_rva = struct.unpack_from('<I', dll_data, desc_off + 16)[0]
|
| 90 |
+
if ilt_rva == 0 and name_rva == 0:
|
| 91 |
+
break
|
| 92 |
+
name_off = rva_to_foff(name_rva)
|
| 93 |
+
if name_off:
|
| 94 |
+
dname = dll_data[name_off:name_off+64].split(b'\x00')[0].decode('ascii', errors='replace')
|
| 95 |
+
if 'bcrypt' in dname.lower():
|
| 96 |
+
print(f"\nDLL: {dname}, ILT RVA: 0x{ilt_rva:08x}, IAT RVA: 0x{iat_rva:08x}")
|
| 97 |
+
# Walk the ILT to find function names
|
| 98 |
+
ilt_off = rva_to_foff(ilt_rva)
|
| 99 |
+
iat_entry_rva = iat_rva
|
| 100 |
+
j = 0
|
| 101 |
+
while ilt_off:
|
| 102 |
+
entry = struct.unpack_from('<Q', dll_data, ilt_off + j * 8)[0]
|
| 103 |
+
if entry == 0:
|
| 104 |
+
break
|
| 105 |
+
if entry & (1 << 63):
|
| 106 |
+
ordinal = entry & 0xFFFF
|
| 107 |
+
print(f" IAT 0x{iat_entry_rva:08x}: Ordinal {ordinal}")
|
| 108 |
+
else:
|
| 109 |
+
hint_rva = entry & 0x7FFFFFFF
|
| 110 |
+
hint_off = rva_to_foff(hint_rva)
|
| 111 |
+
if hint_off:
|
| 112 |
+
hint = struct.unpack_from('<H', dll_data, hint_off)[0]
|
| 113 |
+
fname = dll_data[hint_off+2:hint_off+66].split(b'\x00')[0].decode('ascii', errors='replace')
|
| 114 |
+
print(f" IAT 0x{iat_entry_rva:08x}: {fname} (hint {hint})")
|
| 115 |
+
iat_entry_rva += 8
|
| 116 |
+
j += 1
|
| 117 |
+
idx += 1
|
| 118 |
+
|
| 119 |
+
# Now disassemble the actual BCrypt crypto operation block
|
| 120 |
+
# This is at RVA 0x0015ba45 = file 0x0015ae45
|
| 121 |
+
disasm_region(
|
| 122 |
+
"BCrypt crypto operations (key gen + decrypt)",
|
| 123 |
+
0x0015ae45, 0x0015b200
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Find all indirect calls in extended range
|
| 127 |
+
print("\n" + "="*100)
|
| 128 |
+
print("ALL INDIRECT CALLS (ff 15) in range 0x0015ae00-0x0015c200")
|
| 129 |
+
print("="*100)
|
| 130 |
+
|
| 131 |
+
for i in range(0x0015ae00, 0x0015c200):
|
| 132 |
+
if dll_data[i] == 0xFF and dll_data[i+1] == 0x15:
|
| 133 |
+
rva = file_to_rva(i)
|
| 134 |
+
disp = struct.unpack_from('<i', dll_data, i + 2)[0]
|
| 135 |
+
target_rva = rva + 6 + disp
|
| 136 |
+
print(f" File 0x{i:08x} (RVA 0x{rva:08x}): call [rip+0x{disp:x}] -> IAT@0x{target_rva:08x}")
|
| 137 |
+
|
| 138 |
+
# Also disassemble the function at 0x18015abd0 (called to process data when r14b=true)
|
| 139 |
+
# RVA 0x0015abd0, file 0x00159fd0
|
| 140 |
+
disasm_region(
|
| 141 |
+
"Function at 0x18015abd0 (called on data when r14b=true)",
|
| 142 |
+
0x00159fd0, 0x0015a0c0
|
| 143 |
+
)
|
_archive/attempts/disasm_crypto.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Disassemble the Cipher function in oneocr.dll to find the exact crypto parameters.
|
| 3 |
+
Find code references to the crypto strings we identified.
|
| 4 |
+
"""
|
| 5 |
+
import struct
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
dll_path = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
|
| 9 |
+
with open(dll_path, "rb") as f:
|
| 10 |
+
data = f.read()
|
| 11 |
+
|
| 12 |
+
# Parse PE headers to find section info
|
| 13 |
+
pe_sig_offset = struct.unpack_from("<I", data, 0x3C)[0]
|
| 14 |
+
assert data[pe_sig_offset:pe_sig_offset+4] == b"PE\x00\x00"
|
| 15 |
+
|
| 16 |
+
# COFF header
|
| 17 |
+
coff_start = pe_sig_offset + 4
|
| 18 |
+
num_sections = struct.unpack_from("<H", data, coff_start + 2)[0]
|
| 19 |
+
opt_header_size = struct.unpack_from("<H", data, coff_start + 16)[0]
|
| 20 |
+
|
| 21 |
+
# Optional header
|
| 22 |
+
opt_start = coff_start + 20
|
| 23 |
+
magic = struct.unpack_from("<H", data, opt_start)[0]
|
| 24 |
+
assert magic == 0x20B # PE32+
|
| 25 |
+
|
| 26 |
+
image_base = struct.unpack_from("<Q", data, opt_start + 24)[0]
|
| 27 |
+
|
| 28 |
+
# Sections
|
| 29 |
+
section_start = opt_start + opt_header_size
|
| 30 |
+
sections = []
|
| 31 |
+
for i in range(num_sections):
|
| 32 |
+
s_off = section_start + i * 40
|
| 33 |
+
name = data[s_off:s_off+8].rstrip(b"\x00").decode("ascii", errors="replace")
|
| 34 |
+
vsize = struct.unpack_from("<I", data, s_off + 8)[0]
|
| 35 |
+
va = struct.unpack_from("<I", data, s_off + 12)[0]
|
| 36 |
+
raw_size = struct.unpack_from("<I", data, s_off + 16)[0]
|
| 37 |
+
raw_ptr = struct.unpack_from("<I", data, s_off + 20)[0]
|
| 38 |
+
sections.append((name, va, vsize, raw_ptr, raw_size))
|
| 39 |
+
print(f"Section: {name:10s} VA=0x{va:08x} VSize=0x{vsize:08x} RawPtr=0x{raw_ptr:08x} RawSize=0x{raw_size:08x}")
|
| 40 |
+
|
| 41 |
+
print(f"\nImage base: 0x{image_base:016x}")
|
| 42 |
+
|
| 43 |
+
def rva_to_file_offset(rva):
    """Resolve an RVA to the raw file offset via the parsed section list."""
    for section in sections:
        _, va, vsize, raw_ptr, _ = section
        if not (va <= rva < va + vsize):
            continue
        return raw_ptr + (rva - va)
    return None
|
| 48 |
+
|
| 49 |
+
def file_offset_to_rva(offset, section_table=None):
    """Resolve a raw file offset back to its relative virtual address.

    Inverse of ``rva_to_file_offset``: finds the section whose raw data
    range contains *offset* and maps it into that section's virtual range.

    Args:
        offset: Raw file offset into the DLL image on disk.
        section_table: Optional iterable of ``(name, va, vsize, raw_ptr,
            raw_size)`` tuples; defaults to the module-level ``sections``.

    Returns:
        int RVA, or ``None`` when no section's raw data covers *offset*.
    """
    table = sections if section_table is None else section_table
    for _name, va, _vsize, raw_ptr, raw_size in table:
        if raw_ptr <= offset < raw_ptr + raw_size:
            return va + (offset - raw_ptr)
    return None
|
| 54 |
+
|
| 55 |
+
# Key string offsets we found
|
| 56 |
+
crypto_strings = {
|
| 57 |
+
"SHA256 (wide)": 0x02724b60,
|
| 58 |
+
"AES (wide)": 0x02724b70,
|
| 59 |
+
"BlockLength (wide)": 0x02724b78,
|
| 60 |
+
"ChainingModeCFB (wide)": 0x02724b90,
|
| 61 |
+
"meta->magic_number == MAGIC_NUMBER": 0x02724bb0,
|
| 62 |
+
"Unable to uncompress": 0x02724bd8,
|
| 63 |
+
"Crypto.cpp": 0x02724c08,
|
| 64 |
+
"Error returned from crypto API": 0x02724c40,
|
| 65 |
+
"ChainingMode (wide)": 0x02724c80,
|
| 66 |
+
"MessageBlockLength (wide)": 0x02724ca0,
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
# Calculate RVAs of these strings
|
| 70 |
+
print("\n=== String RVAs ===")
|
| 71 |
+
for name, file_off in crypto_strings.items():
|
| 72 |
+
rva = file_offset_to_rva(file_off)
|
| 73 |
+
if rva:
|
| 74 |
+
print(f" {name}: file=0x{file_off:08x} RVA=0x{rva:08x}")
|
| 75 |
+
|
| 76 |
+
# Find code references to these strings via LEA instruction patterns
|
| 77 |
+
# In x64, LEA reg, [rip+disp32] is encoded as:
|
| 78 |
+
# 48 8D xx yy yy yy yy (where xx determines the register)
|
| 79 |
+
# or 4C 8D xx yy yy yy yy
|
| 80 |
+
# The target address = instruction_address + 7 + disp32
|
| 81 |
+
|
| 82 |
+
print("\n=== Searching for code references to crypto strings ===")
|
| 83 |
+
|
| 84 |
+
# Focus on the most important strings
|
| 85 |
+
key_strings = {
|
| 86 |
+
"ChainingModeCFB (wide)": 0x02724b90,
|
| 87 |
+
"SHA256 (wide)": 0x02724b60,
|
| 88 |
+
"AES (wide)": 0x02724b70,
|
| 89 |
+
"Crypto.cpp": 0x02724c08,
|
| 90 |
+
"MessageBlockLength (wide)": 0x02724ca0,
|
| 91 |
+
"meta->magic_number": 0x02724bb0,
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
# Find the .text section (code)
|
| 95 |
+
text_section = None
|
| 96 |
+
for name, va, vsize, raw_ptr, raw_size in sections:
|
| 97 |
+
if name == ".text":
|
| 98 |
+
text_section = (va, vsize, raw_ptr, raw_size)
|
| 99 |
+
break
|
| 100 |
+
|
| 101 |
+
if text_section:
|
| 102 |
+
text_va, text_vsize, text_raw, text_rawsize = text_section
|
| 103 |
+
print(f"\n.text section: VA=0x{text_va:08x} size=0x{text_vsize:08x}")
|
| 104 |
+
|
| 105 |
+
for string_name, string_file_off in key_strings.items():
|
| 106 |
+
string_rva = file_offset_to_rva(string_file_off)
|
| 107 |
+
if string_rva is None:
|
| 108 |
+
continue
|
| 109 |
+
|
| 110 |
+
# Search for LEA instructions referencing this RVA
|
| 111 |
+
# LEA uses RIP-relative addressing: target = RIP + disp32
|
| 112 |
+
# RIP at instruction = instruction_RVA + instruction_length (typically 7 for LEA)
|
| 113 |
+
refs_found = []
|
| 114 |
+
|
| 115 |
+
for code_off in range(text_raw, text_raw + text_rawsize - 7):
|
| 116 |
+
# Check for LEA patterns
|
| 117 |
+
b0 = data[code_off]
|
| 118 |
+
b1 = data[code_off + 1]
|
| 119 |
+
|
| 120 |
+
# 48 8D 0D/15/05/1D/25/2D/35/3D = LEA with REX.W
|
| 121 |
+
# 4C 8D 05/0D/15/1D/25/2D/35/3D = LEA with REX.WR
|
| 122 |
+
if b0 in (0x48, 0x4C) and b1 == 0x8D:
|
| 123 |
+
modrm = data[code_off + 2]
|
| 124 |
+
if (modrm & 0xC7) == 0x05: # mod=00, rm=101 (RIP-relative)
|
| 125 |
+
disp32 = struct.unpack_from("<i", data, code_off + 3)[0]
|
| 126 |
+
instr_rva = file_offset_to_rva(code_off)
|
| 127 |
+
if instr_rva is None:
|
| 128 |
+
continue
|
| 129 |
+
target_rva = instr_rva + 7 + disp32
|
| 130 |
+
if target_rva == string_rva:
|
| 131 |
+
reg_idx = (modrm >> 3) & 7
|
| 132 |
+
if b0 == 0x4C:
|
| 133 |
+
reg_idx += 8
|
| 134 |
+
reg_names = ["rax","rcx","rdx","rbx","rsp","rbp","rsi","rdi",
|
| 135 |
+
"r8","r9","r10","r11","r12","r13","r14","r15"]
|
| 136 |
+
reg = reg_names[reg_idx]
|
| 137 |
+
refs_found.append((code_off, instr_rva, reg))
|
| 138 |
+
|
| 139 |
+
if refs_found:
|
| 140 |
+
print(f"\n References to '{string_name}' (RVA=0x{string_rva:08x}):")
|
| 141 |
+
for code_off, instr_rva, reg in refs_found[:5]:
|
| 142 |
+
print(f" at file=0x{code_off:08x} RVA=0x{instr_rva:08x}: LEA {reg}, [{string_name}]")
|
| 143 |
+
# Dump surrounding code
|
| 144 |
+
ctx_start = max(text_raw, code_off - 64)
|
| 145 |
+
ctx_end = min(text_raw + text_rawsize, code_off + 128)
|
| 146 |
+
|
| 147 |
+
# Simple bytecode dump with some x64 instruction markers
|
| 148 |
+
print(f" Context (file offset 0x{ctx_start:08x} - 0x{ctx_end:08x}):")
|
| 149 |
+
for i in range(ctx_start, ctx_end, 16):
|
| 150 |
+
chunk = data[i:i+16]
|
| 151 |
+
hex_part = " ".join(f"{b:02x}" for b in chunk)
|
| 152 |
+
rva_i = file_offset_to_rva(i)
|
| 153 |
+
marker = " <<<" if i <= code_off < i + 16 else ""
|
| 154 |
+
print(f" {rva_i:08x}: {hex_part}{marker}")
|
| 155 |
+
else:
|
| 156 |
+
print(f"\n No code references found for '{string_name}'")
|
_archive/attempts/disasm_full_cipher.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Full disassembly of the Cipher function from AES setup through BCryptDecrypt.
|
| 3 |
+
Based on findings:
|
| 4 |
+
- SHA256 provider at file 0x0015a3e2 (RVA 0x0015afe2)
|
| 5 |
+
- AES provider at file 0x0015a702 (RVA 0x0015b302)
|
| 6 |
+
- ChainingModeCFB at file 0x0015a7cd (RVA 0x0015b3cd)
|
| 7 |
+
- MessageBlockLength at file 0x0015a7fc (RVA 0x0015b3fc)
|
| 8 |
+
- BCryptGenerateSymmetricKey import at ~0x027ef0a2
|
| 9 |
+
- Need to find: key handling, IV passing, BCryptDecrypt call
|
| 10 |
+
"""
|
| 11 |
+
import struct
|
| 12 |
+
from capstone import Cs, CS_ARCH_X86, CS_MODE_64
|
| 13 |
+
|
| 14 |
+
DLL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
|
| 15 |
+
IMAGE_BASE = 0x180000000
|
| 16 |
+
TEXT_VA = 0x1000
|
| 17 |
+
TEXT_FILE_OFFSET = 0x400 # .text section file offset
|
| 18 |
+
|
| 19 |
+
def rva_to_file(rva):
    """Map an RVA in .text to the raw file offset (fixed section layout)."""
    delta = rva - TEXT_VA
    return TEXT_FILE_OFFSET + delta
|
| 21 |
+
|
| 22 |
+
def file_to_rva(foff):
    """Map a raw file offset in .text back to its RVA."""
    delta = foff - TEXT_FILE_OFFSET
    return TEXT_VA + delta
|
| 24 |
+
|
| 25 |
+
with open(DLL_PATH, "rb") as f:
|
| 26 |
+
dll_data = f.read()
|
| 27 |
+
|
| 28 |
+
md = Cs(CS_ARCH_X86, CS_MODE_64)
|
| 29 |
+
md.detail = False
|
| 30 |
+
|
| 31 |
+
def disasm_region(name, file_start, file_end):
    """Disassemble dll_data[file_start:file_end] and print a listing.

    Every instruction is shown with its RVA, raw file offset, byte
    pattern, mnemonic and operands so findings can be cross-referenced
    against both a hex editor and a debugger view.
    """
    rva_start = file_to_rva(file_start)
    va_start = IMAGE_BASE + rva_start
    code = dll_data[file_start:file_end]

    bar = "=" * 100
    print("\n" + bar)
    print(f"{name}")
    print(f"File: 0x{file_start:08x}-0x{file_end:08x}, RVA: 0x{rva_start:08x}, VA: 0x{va_start:016x}")
    print(bar)

    for insn in md.disasm(code, va_start):
        # Map the VA capstone reports back to an RVA / raw file offset.
        rva = insn.address - IMAGE_BASE
        foff = rva_to_file(rva)
        print(f" {rva:08x} ({foff:08x}): {insn.bytes.hex():<40s} {insn.mnemonic:<14s} {insn.op_str}")
|
| 42 |
+
|
| 43 |
+
# The Cipher function appears to start before the AES setup.
|
| 44 |
+
# Let's find the function prologue by scanning backwards from the AES setup.
|
| 45 |
+
# The AES LEA is at file 0x0015a702. Let's look for a typical function prologue.
|
| 46 |
+
|
| 47 |
+
# First, let's find the actual function start
|
| 48 |
+
# Look for common prologues (push rbp, sub rsp, mov [rsp+...], etc.) before the AES reference
|
| 49 |
+
# Search backwards from 0x0015a702 for push rbp or sub rsp patterns
print("\n" + "="*100)
print("SCANNING FOR FUNCTION PROLOGUE before AES setup (file 0x0015a702)")
print("="*100)

search_start = 0x0015a500  # Start from after SHA256Hash function
search_end = 0x0015a710
search_region = dll_data[search_start:search_end]

# Common x64 function prologue byte patterns:
#   48 89 5C 24 xx        mov [rsp+xx], rbx
#   48 89 74 24 xx        mov [rsp+xx], rsi
#   55                    push rbp (only meaningful after ret/int3/nop padding)
#   40 55                 push rbp (with REX prefix)
#   48 83 EC 28           sub rsp, 0x28
#   48 81 EC xx xx xx xx  sub rsp, imm32
for i in range(len(search_region) - 4):
    b = search_region[i:i+8]  # may be shorter than 8 bytes near the region end
    foff = search_start + i
    rva = file_to_rva(foff)

    # Look for function start patterns
    if b[:5] == bytes([0x48, 0x89, 0x5C, 0x24, 0x08]):  # mov [rsp+8], rbx
        print(f" Possible prologue at file 0x{foff:08x} (RVA 0x{rva:08x}): mov [rsp+8], rbx")
    elif b[:2] == bytes([0x40, 0x55]):  # push rbp with REX
        print(f" Possible prologue at file 0x{foff:08x} (RVA 0x{rva:08x}): REX push rbp")
    elif b[:1] == bytes([0x55]) and (i == 0 or search_region[i-1] in (0xC3, 0xCC, 0x90)):
        print(f" Possible prologue at file 0x{foff:08x} (RVA 0x{rva:08x}): push rbp (after ret/nop/int3)")
    elif b[:4] == bytes([0x48, 0x83, 0xEC, 0x28]):  # sub rsp, 0x28
        print(f" Possible prologue at file 0x{foff:08x} (RVA 0x{rva:08x}): sub rsp, 0x28")
    elif b[:3] == bytes([0x48, 0x81, 0xEC]) and len(b) >= 7:  # sub rsp, imm32
        # BUGFIX: guard len(b) >= 7 -- the loop bound only guarantees 5
        # bytes in the slice, so unpack_from('<I', b, 3) previously raised
        # struct.error on a "48 81 EC" match in the last few bytes.
        val = struct.unpack_from('<I', b, 3)[0]
        print(f" Possible prologue at file 0x{foff:08x} (RVA 0x{rva:08x}): sub rsp, 0x{val:X}")
|
| 83 |
+
|
| 84 |
+
# Now disassemble the ENTIRE Cipher function region - from after SHA256Hash to well past all setup
# The function is large, so let's do it in meaningful chunks.
# Parts 1 and 2 are contiguous (0x0015a500-0x0015a880); part 3 jumps
# ahead to the suspected key-gen/decrypt tail of the same function.

# Region 1: Function start to AES provider setup
disasm_region(
    "Cipher function part 1: prologue to AES provider",
    0x0015a500, 0x0015a720
)

# Region 2: AES provider setup through ChainingMode and MessageBlockLength
disasm_region(
    "Cipher function part 2: AES provider, ChainingModeCFB, MessageBlockLength",
    0x0015a720, 0x0015a880
)

# Region 3: After IV extraction, BCryptGenerateSymmetricKey, BCryptDecrypt calls
# This is the critical region we need
disasm_region(
    "Cipher function part 3: key gen and decrypt (extended)",
    0x0015abd0, 0x0015ae00
)
|
| 105 |
+
|
| 106 |
+
# Also check what's around the BCryptDecrypt import call
# BCrypt imports are indirect calls through IAT
# Let's find all indirect calls (FF 15) in the cipher function range
print("\n" + "="*100)
print("ALL INDIRECT CALLS (ff 15) in Cipher function region 0x0015a500-0x0015ae00")
print("="*100)

search_start = 0x0015a500
search_end = 0x0015ae00
for i in range(search_end - search_start - 6):
    foff = search_start + i
    # FF 15 xx xx xx xx = call qword ptr [rip+disp32]
    if dll_data[foff] == 0xFF and dll_data[foff+1] == 0x15:
        rva = file_to_rva(foff)
        disp = struct.unpack_from('<i', dll_data, foff + 2)[0]  # signed disp32
        target_rva = rva + 6 + disp  # RIP-relative: end-of-instruction RVA + disp
        target_foff = rva_to_file(target_rva)
        # Read the IAT entry (8 bytes at the target) to help identify the
        # import being called.
        iat_value = struct.unpack_from('<Q', dll_data, target_foff)[0] if target_foff + 8 <= len(dll_data) else 0
        # BUGFIX: iat_value was computed but never reported; include it in
        # the listing so each call site can be matched to an import.
        print(f" File 0x{foff:08x} (RVA 0x{rva:08x}): call [rip+0x{disp:x}] -> IAT at RVA 0x{target_rva:08x}, entry 0x{iat_value:016x}")
|
| 125 |
+
|
| 126 |
+
# Also disassemble the region between IV handling (0x0015abdb) and magic number check (0x0015a170)
# This might contain the actual BCryptDecrypt call
# NOTE: this range overlaps the tail of "part 3" above (both end at 0x0015ae00).
disasm_region(
    "Cipher function part 4: from end of IV path to function cleanup",
    0x0015ac00, 0x0015ae00
)

# Look for the region right before the magic number check function
# The Cipher function should return, and then a caller invokes the magic check
disasm_region(
    "Pre-magic-check function caller",
    0x0015a0c0, 0x0015a170
)
|
_archive/attempts/disasm_proper.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Proper disassembly of the Cipher function in oneocr.dll using capstone.
|
| 3 |
+
Focus on the crypto setup flow: key derivation, IV, AES parameters.
|
| 4 |
+
"""
|
| 5 |
+
import struct
|
| 6 |
+
from capstone import Cs, CS_ARCH_X86, CS_MODE_64
|
| 7 |
+
|
| 8 |
+
dll_path = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
|
| 9 |
+
with open(dll_path, "rb") as f:
|
| 10 |
+
data = f.read()
|
| 11 |
+
|
| 12 |
+
# PE parsing (simplified): locate the PE signature via e_lfanew, then walk
# the COFF/optional headers to recover the image base and section table.
pe_sig_offset = struct.unpack_from("<I", data, 0x3C)[0]  # e_lfanew
coff_start = pe_sig_offset + 4  # skip the 4-byte "PE\0\0" signature
opt_header_size = struct.unpack_from("<H", data, coff_start + 16)[0]
opt_start = coff_start + 20  # COFF file header is 20 bytes
image_base = struct.unpack_from("<Q", data, opt_start + 24)[0]  # PE32+ ImageBase
num_sections = struct.unpack_from("<H", data, coff_start + 2)[0]
section_start = opt_start + opt_header_size

# Collect (name, virtual_address, virtual_size, raw_ptr, raw_size) per section.
sections = []
for i in range(num_sections):
    s_off = section_start + i * 40  # each IMAGE_SECTION_HEADER is 40 bytes
    name = data[s_off:s_off+8].rstrip(b"\x00").decode("ascii", errors="replace")
    vsize = struct.unpack_from("<I", data, s_off + 8)[0]
    va = struct.unpack_from("<I", data, s_off + 12)[0]
    raw_size = struct.unpack_from("<I", data, s_off + 16)[0]
    raw_ptr = struct.unpack_from("<I", data, s_off + 20)[0]
    sections.append((name, va, vsize, raw_ptr, raw_size))
|
| 30 |
+
|
| 31 |
+
def rva_to_file_offset(rva):
    """Translate an RVA into a raw file offset using the section table.

    Returns None when the RVA falls outside every section's virtual range.
    """
    for name, va, vsize, raw_ptr, raw_size in sections:
        delta = rva - va
        if 0 <= delta < vsize:
            return raw_ptr + delta
    return None
|
| 36 |
+
|
| 37 |
+
def file_offset_to_rva(offset):
    """Translate a raw file offset back into an RVA via the section table.

    Returns None when the offset lies outside every section's raw data.
    """
    for name, va, vsize, raw_ptr, raw_size in sections:
        delta = offset - raw_ptr
        if 0 <= delta < raw_size:
            return va + delta
    return None
|
| 42 |
+
|
| 43 |
+
md = Cs(CS_ARCH_X86, CS_MODE_64)
|
| 44 |
+
md.detail = True
|
| 45 |
+
|
| 46 |
+
def disasm_region(file_start, file_end, label=""):
    """Disassemble data[file_start:file_end] and print each instruction.

    file_start/file_end are raw file offsets; the listing shows both the
    RVA and the file offset of every instruction for cross-referencing.
    """
    code_bytes = data[file_start:file_end]
    base_rva = file_offset_to_rva(file_start)
    base_addr = image_base + base_rva  # VA that capstone will report

    print(f"\n{'='*80}")
    print(f"{label}")
    print(f"File: 0x{file_start:08x}-0x{file_end:08x}, RVA: 0x{base_rva:08x}, VA: 0x{base_addr:016x}")
    print(f"{'='*80}")

    for instr in md.disasm(code_bytes, base_addr):
        # Map capstone's VA back to a file offset / RVA for the listing.
        file_off = file_start + (instr.address - base_addr)
        rva = base_rva + (instr.address - base_addr)

        hex_bytes = " ".join(f"{b:02x}" for b in instr.bytes)
        print(f" {rva:08x} ({file_off:08x}): {hex_bytes:<30s} {instr.mnemonic:10s} {instr.op_str}")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Key code regions to disassemble (from our earlier analysis)
|
| 66 |
+
# These are file offsets where important crypto code is
|
| 67 |
+
regions = [
|
| 68 |
+
# SHA256 provider setup
|
| 69 |
+
(0x0015a3a0, 0x0015a500, "SHA256Hash function - BCryptOpenAlgorithmProvider for SHA256"),
|
| 70 |
+
|
| 71 |
+
# AES provider setup and ChainingMode/MessageBlockLength
|
| 72 |
+
(0x0015a6b0, 0x0015a880, "Cipher function - AES setup, ChainingModeCFB, MessageBlockLength"),
|
| 73 |
+
|
| 74 |
+
# Key generation and decrypt/encrypt
|
| 75 |
+
(0x0015a880, 0x0015aA00, "Cipher function - key generation and encrypt/decrypt"),
|
| 76 |
+
|
| 77 |
+
# Magic number check and uncompress
|
| 78 |
+
(0x0015a170, 0x0015a300, "Magic number check and uncompress"),
|
| 79 |
+
]
|
| 80 |
+
|
| 81 |
+
for file_start, file_end, label in regions:
|
| 82 |
+
disasm_region(file_start, file_end, label)
|
| 83 |
+
|
| 84 |
+
# Also look for the function that calls BCryptDecrypt
|
| 85 |
+
# BCryptDecrypt is called via an indirect call through the import table
|
| 86 |
+
# Let me find the BCryptDecrypt IAT entry
|
| 87 |
+
print("\n\n=== Finding BCryptDecrypt call sites ===")
|
| 88 |
+
|
| 89 |
+
# The call at 0015b3de: ff 15 23 f2 6b 00 is CALL [rip+0x006bf223]
|
| 90 |
+
# This is an indirect call through the IAT
|
| 91 |
+
# Let me find similar patterns near the ChainingModeCFB reference
|
| 92 |
+
# After ChainingMode and MessageBlockLength are set, the next step is GenerateSymmetricKey
|
| 93 |
+
|
| 94 |
+
# Disassemble the broader decrypt region
|
| 95 |
+
disasm_region(0x0015a880, 0x0015abe0, "Post-setup: key generation, IV, encrypt/decrypt")
|
_archive/attempts/discover_key_derivation.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Discover key derivation: what SHA256 input produces each chunk's secret key?"""
|
| 2 |
+
import hashlib
|
| 3 |
+
import struct
|
| 4 |
+
|
| 5 |
+
# Master-key and IV candidates fed into the derivation trials below.
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
IV = b"Copyright @ OneO"

# Captured per-chunk secrets from hook
CHUNK_SECRETS = {
    0: bytes.fromhex("d13142a17603a8e25c9ca2f90761f7fdf31ad106fd224fb7fe6a33e695c0f25a"),  # DX index
    1: bytes.fromhex("82aa42940241cc1ef7b72b3b8a22acd7f1eac465069c4b375d129f304dbd9363"),  # Config
    2: bytes.fromhex("af1442f4972ca3254d4b496c6c1c55e071a808089f814957c7002c4762fecd15"),  # ONNX encrypt+chunk
    3: bytes.fromhex("1bc0a4cfe390d35e0597d4a67451d9c8f62f53df962804a6e6907cddb3d0004b"),  # Big ONNX model
    4: bytes.fromhex("c1e03295f3793ee74c685bfe3872ec795e76f731e939abfd09120ada886a9228"),  # ONNX model
}

print("=" * 70)
print("SHA256 Key Derivation Discovery")
print("=" * 70)
print(f"Master key: {KEY!r}")
print(f"SHA256(key) = {hashlib.sha256(KEY).hexdigest()}")
print()

# Test various derivation schemes
def try_hash(label, data, target_idx=None):
    """SHA-256 *data* and compare the digest against the chunk secrets.

    When target_idx is given only that chunk's secret is checked.  Prints
    a marker and returns True on the first match, otherwise False.
    """
    digest = hashlib.sha256(data).digest()
    for idx, secret in CHUNK_SECRETS.items():
        if target_idx is not None and idx != target_idx:
            continue
        if digest == secret:
            print(f" *** MATCH chunk {idx}! *** {label} -> {digest.hex()}")
            return True
    return False
|
| 34 |
+
|
| 35 |
+
# --- Exhaustive candidate derivation schemes -----------------------------
# Each section hashes a different combination of master key, IV, counters
# and file metadata; try_hash() reports any candidate whose SHA-256 digest
# equals one of the captured per-chunk secrets.

print("--- Simple hashes ---")
try_hash("SHA256(key)", KEY)
try_hash("SHA256(IV)", IV)
try_hash("SHA256(key+IV)", KEY + IV)
try_hash("SHA256(IV+key)", IV + KEY)

print("\n--- Key + counter ---")
# Counter appended/prepended in several integer encodings.
for i in range(10):
    try_hash(f"SHA256(key + uint8({i}))", KEY + bytes([i]))
    try_hash(f"SHA256(key + uint32LE({i}))", KEY + struct.pack('<I', i))
    try_hash(f"SHA256(key + uint64LE({i}))", KEY + struct.pack('<Q', i))
    try_hash(f"SHA256(uint8({i}) + key)", bytes([i]) + KEY)
    try_hash(f"SHA256(uint32LE({i}) + key)", struct.pack('<I', i) + KEY)
    try_hash(f"SHA256(uint64LE({i}) + key)", struct.pack('<Q', i) + KEY)

print("\n--- Key + string counter ---")
for i in range(10):
    try_hash(f"SHA256(key + '{i}')", KEY + str(i).encode())
    try_hash(f"SHA256('{i}' + key)", str(i).encode() + KEY)

print("\n--- Double hash ---")
h1 = hashlib.sha256(KEY).digest()
try_hash("SHA256(SHA256(key))", h1)
for i in range(10):
    try_hash(f"SHA256(SHA256(key) + uint8({i}))", h1 + bytes([i]))
    try_hash(f"SHA256(SHA256(key) + uint32LE({i}))", h1 + struct.pack('<I', i))

print("\n--- HMAC-SHA256 ---")
import hmac
# HMAC candidates are compared inline (try_hash only does plain SHA-256).
for i in range(10):
    h = hmac.new(KEY, bytes([i]), hashlib.sha256).digest()
    for idx, secret in CHUNK_SECRETS.items():
        if h == secret:
            print(f" *** MATCH chunk {idx}! *** HMAC(key, uint8({i}))")
    h = hmac.new(KEY, struct.pack('<I', i), hashlib.sha256).digest()
    for idx, secret in CHUNK_SECRETS.items():
        if h == secret:
            print(f" *** MATCH chunk {idx}! *** HMAC(key, uint32LE({i}))")

# Read file header data that might be used in derivation
from pathlib import Path
file_data = Path("ocr_data/oneocr.onemodel").read_bytes()
header = file_data[:24]  # First 24 bytes (before encrypted DX)
print(f"\nFile header (offset 0-23): {header.hex()}")
header_size = struct.unpack('<I', file_data[:4])[0]
print(f"Header size field: {header_size}")

print("\n--- Key + file header data ---")
try_hash("SHA256(key + header[:8])", KEY + header[:8])
try_hash("SHA256(key + header[:16])", KEY + header[:16])
try_hash("SHA256(key + header[:24])", KEY + header[:24])
try_hash("SHA256(header[:8] + key)", header[:8] + KEY)
try_hash("SHA256(header[:16] + key)", header[:16] + KEY)
try_hash("SHA256(header[:24] + key)", header[:24] + KEY)

# Try with known offsets
print("\n--- Key + chunk offset ---")
offsets = [24, 22648, 22684]  # Known offsets
for off in offsets:
    try_hash(f"SHA256(key + uint32LE({off}))", KEY + struct.pack('<I', off))
    try_hash(f"SHA256(key + uint64LE({off}))", KEY + struct.pack('<Q', off))

# Try with chunk sizes
print("\n--- Key + chunk sizes ---")
sizes = [22624, 11920, 11553680]
for sz in sizes:
    try_hash(f"SHA256(key + uint32LE({sz}))", KEY + struct.pack('<I', sz))

# Try iterative: SHA256(key), SHA256(prev_hash), ...
print("\n--- Iterative hashing (chain) ---")
h = KEY
for i in range(10):
    h = hashlib.sha256(h).digest()
    for idx, secret in CHUNK_SECRETS.items():
        if h == secret:
            print(f" *** MATCH chunk {idx}! *** SHA256^{i+1}(key)")

# Try key + IV combos
print("\n--- Key + IV + counter ---")
for i in range(10):
    try_hash(f"SHA256(key + IV + uint8({i}))", KEY + IV + bytes([i]))
    try_hash(f"SHA256(IV + key + uint8({i}))", IV + KEY + bytes([i]))
    try_hash(f"SHA256(key + uint8({i}) + IV)", KEY + bytes([i]) + IV)

# Try XOR-based derivation
print("\n--- XOR key with counter ---")
for i in range(10):
    xor_key = bytes(b ^ i for b in KEY)
    try_hash(f"SHA256(key XOR {i})", xor_key)

print("\n--- If no match found, need to hook BCryptHash/BCryptHashData ---")
print("to see exact SHA256 input data")
|
_archive/attempts/dll_bcrypt_analysis.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
# Load the DLL image once; use a context manager so the file handle is
# closed promptly instead of being leaked by a bare open().read().
with open(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll", "rb") as fh:
    data = fh.read()

# Find ALL BCrypt function occurrences
print("=== All BCrypt function references ===")
# BUGFIX: raw bytes literal -- b'BCrypt\w+' contains the invalid escape
# sequence "\w" (SyntaxWarning on modern CPython); rb'...' passes the
# regex metacharacter through unchanged with identical match behavior.
for m in re.finditer(rb'BCrypt\w+', data):
    offset = m.start()
    name = m.group().decode('ascii')
    print(f" [0x{offset:08x}] {name}")

# Search for BCryptGenerateSymmetricKey and BCryptImportKey specifically
print()
for fn in [b"BCryptGenerateSymmetricKey", b"BCryptImportKey", b"BCryptCreateHash"]:
    pos = data.find(fn)
    print(f" {fn.decode()}: {'FOUND at 0x' + format(pos, '08x') if pos != -1 else 'NOT FOUND'}")
|
| 17 |
+
|
| 18 |
+
# Look for MAGIC_NUMBER constant value: locate the assertion text and dump
# all printable strings in a +/-100 byte window around each occurrence.
print()
print("=== Looking for MAGIC_NUMBER = 1 constant context ===")
for pattern in [b"magic_number == MAGIC_NUMBER"]:
    pos = data.find(pattern)
    while pos != -1:
        # Dump wider context
        ctx_start = max(0, pos - 100)
        ctx_end = min(len(data), pos + 100)
        ctx = data[ctx_start:ctx_end]
        # Find strings in context (runs of 4+ printable ASCII bytes)
        for m in re.finditer(b'[\x20-\x7e]{4,}', ctx):
            print(f" [0x{ctx_start + m.start():08x}] {m.group().decode('ascii')}")
        pos = data.find(pattern, pos + 1)
        if pos == -1:
            break

# Look at the region right around the crypto strings for more context:
# classic hex+ASCII dump, 16 bytes per row.
print()
print("=== Extended crypto region dump 0x02724b00-0x02724d00 ===")
for i in range(0x02724b00, 0x02724d00, 16):
    chunk = data[i:i+16]
    hex_part = " ".join(f"{b:02x}" for b in chunk)
    ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
    print(f" {i:08x}: {hex_part:<48s} {ascii_part}")

# Check for constant values near magic_number assertion - look for "1" as uint32
# Find the code that references the magic_number string
print()
print("=== Finding code references to Crypto.cpp ===")
crypto_path = b"C:\\__w\\1\\s\\CoreEngine\\Native\\ModelParser\\Crypto.cpp"
pos = data.find(crypto_path)
if pos != -1:
    # This is in the .rdata section. Find cross-references to this address
    # In x64, look for LEA instructions referencing this RVA
    print(f" Crypto.cpp string at: 0x{pos:08x}")

# Look for the "block length" being set - find 16 as a byte constant near
# BlockLength string.  The property names are searched as UTF-16LE
# (interleaved NUL bytes), the encoding used for BCrypt property strings.
print()
print("=== Looking for block length values near crypto code ===")
bl_str = data.find(b"B\x00l\x00o\x00c\x00k\x00L\x00e\x00n\x00g\x00t\x00h\x00")
if bl_str != -1:
    print(f" BlockLength wide string at: 0x{bl_str:08x}")
ml_str = data.find(b"M\x00e\x00s\x00s\x00a\x00g\x00e\x00B\x00l\x00o\x00c\x00k\x00L\x00e\x00n\x00g\x00t\x00h\x00")
if ml_str != -1:
    print(f" MessageBlockLength wide string at: 0x{ml_str:08x}")
|
_archive/attempts/dll_crypto_analysis.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deep analysis of oneocr.dll to find the exact decryption algorithm.
|
| 3 |
+
Searches for Crypto.cpp references, key/IV derivation patterns, and
|
| 4 |
+
the structure of .onemodel container format.
|
| 5 |
+
"""
|
| 6 |
+
import struct
|
| 7 |
+
import re
|
| 8 |
+
import os
|
| 9 |
+
from collections import Counter
|
| 10 |
+
import math
|
| 11 |
+
|
| 12 |
+
DLL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
MODEL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"

# Load both binaries fully into memory; every phase below scans these buffers.
with open(DLL_PATH, "rb") as f:
    dll = f.read()

with open(MODEL_PATH, "rb") as f:
    model = f.read()
|
| 20 |
+
|
| 21 |
+
def entropy_calc(data):
    """Return the Shannon entropy of *data* in bits per symbol.

    Empty input yields 0.0.  Values close to 8.0 suggest random or
    encrypted bytes; low values suggest structured data.
    """
    if not data:
        return 0.0
    n = len(data)
    bits = 0.0
    for count in Counter(data).values():
        p = count / n
        bits -= p * math.log2(p)
    return bits
|
| 27 |
+
|
| 28 |
+
print("=" * 80)
print("PHASE 1: Crypto.cpp and error message strings")
print("=" * 80)

# Crypto/model-loading related terms to hunt for in the DLL image.
search_strings = [
    b'Crypto.cpp', b'magic_number', b'MAGIC_NUMBER', b'uncompress',
    b'Uncompress', b'Source data', b'mismatch', b'Check failed',
    b'Unable to', b'model_data', b'ModelData', b'decrypt', b'Decrypt',
    b'LoadModel', b'load_model', b'onemodel', b'.onemodel',
    b'ParseModel', b'DeserializeModel', b'ReadModel',
]

all_found = set()
for term in search_strings:
    for m in re.finditer(re.escape(term), dll, re.IGNORECASE):
        offset = m.start()
        # Expand the hit to the surrounding run of printable ASCII so the
        # whole embedded string is reported, not just the search term.
        s = offset
        while s > 0 and 0x20 <= dll[s-1] < 0x7f:
            s -= 1
        e = offset + len(term)
        while e < len(dll) and 0x20 <= dll[e] < 0x7f:
            e += 1
        full = dll[s:e].decode('ascii', errors='ignore')
        if full not in all_found and len(full) > 3:  # de-dup, skip tiny fragments
            all_found.add(full)
            print(f" 0x{s:08x}: {full[:250]}")
|
| 54 |
+
|
| 55 |
+
print(f"\n" + "=" * 80)
print("PHASE 2: Compression library strings")
print("=" * 80)

# Identify which decompression library "uncompress" might belong to.
for pattern in [b'uncompress', b'compress', b'inflate', b'deflate',
                b'lz4', b'LZ4', b'snappy', b'Snappy', b'zstd', b'ZSTD',
                b'zlib', b'ZLIB', b'brotli', b'lzma', b'LZMA']:
    idx = 0
    seen = set()
    while True:
        idx = dll.find(pattern, idx)
        if idx < 0:
            break
        # Expand to the full printable-ASCII run around the hit.
        s = idx
        while s > 0 and 0x20 <= dll[s-1] < 0x7f:
            s -= 1
        e = idx + len(pattern)
        while e < len(dll) and 0x20 <= dll[e] < 0x7f:
            e += 1
        full = dll[s:e].decode('ascii', errors='ignore')
        if full not in seen and len(full) > 3:
            seen.add(full)
            print(f" 0x{s:08x}: {full[:200]}")
        idx = e  # resume the search after this run

print(f"\n" + "=" * 80)
print("PHASE 3: .onemodel file structure analysis")
print("=" * 80)

filesize = len(model)
h_size = struct.unpack_from("<I", model, 0)[0]  # 22636

print(f"File size: {filesize:,} bytes ({filesize/1024/1024:.2f} MB)")
print(f"Header size (uint32 @ 0): {h_size}")

# Detailed header boundary analysis: dump the uint32s right after the header.
print(f"\nAt header boundary (offset {h_size}):")
for i in range(0, 64, 4):
    off = h_size + i
    val32 = struct.unpack_from("<I", model, off)[0]
    print(f" @{off:6d} (+{i:2d}): u32={val32:>12,} (0x{val32:08x}) hex={model[off:off+4].hex()}")

# Critical check: does any uint64 at header boundary == remaining data?
print(f"\nSize field search at header boundary:")
for i in range(0, 32, 4):
    off = h_size + i
    if off + 8 <= filesize:
        val64 = struct.unpack_from("<Q", model, off)[0]
        remaining = filesize - (off + 8)
        diff = abs(val64 - remaining)
        if diff < 1000:  # near-match tolerates small trailers/padding
            print(f" *** @{off} (+{i}): u64={val64:,} remaining={remaining:,} diff={diff}")

# Check header entropy pattern: low-entropy chunks indicate plaintext structure.
print(f"\nHeader entropy (256-byte chunks):")
for chunk_start in range(0, h_size, 256):
    chunk_end = min(chunk_start + 256, h_size)
    chunk = model[chunk_start:chunk_end]
    ent = entropy_calc(chunk)
    uniq = len(set(chunk))
    tag = " ← STRUCTURED!" if ent < 5.0 else (" ← moderate" if ent < 7.0 else "")
    # Only print interesting chunks plus the first/last chunk of the header.
    if ent < 6.0 or chunk_start < 256 or chunk_start >= h_size - 256:
        print(f" [{chunk_start:5d}:{chunk_end:5d}] ent={ent:.3f} uniq={uniq:3d}/256{tag}")

# Search for substructures within header: look for recurring uint32 patterns
print(f"\nSearching for structure markers in header (first 100 bytes):")
for i in range(0, min(100, h_size), 4):
    val = struct.unpack_from("<I", model, i)[0]
    # Small counters or plausible in-file offsets/sizes are worth showing.
    if val < 1000 or (1000000 < val < filesize):
        print(f" @{i:4d}: u32={val:>12,} (0x{val:08x})")
|
| 125 |
+
|
| 126 |
+
print(f"\n" + "=" * 80)
print("PHASE 4: Sub-model references in DLL")
print("=" * 80)

# Names that sub-models inside the .onemodel container might use.
submodel_patterns = [
    b'detector', b'Detector', b'recognizer', b'Recognizer',
    b'normalizer', b'Normalizer', b'classifier', b'Classifier',
    b'dispatch', b'Dispatch', b'barcode', b'Barcode',
    b'text_detect', b'text_recog', b'TextDetect', b'TextRecog',
    b'CTC', b'transformer', b'Transformer',
    b'model_type', b'ModelType', b'model_name', b'ModelName',
    b'sub_model', b'SubModel', b'segment',
]

found = set()
for pattern in submodel_patterns:
    for m in re.finditer(pattern, dll, re.IGNORECASE):
        # Expand the hit to the surrounding printable-ASCII run.
        s = m.start()
        while s > 0 and 0x20 <= dll[s-1] < 0x7f:
            s -= 1
        e = m.end()
        while e < len(dll) and 0x20 <= dll[e] < 0x7f:
            e += 1
        full = dll[s:e].decode('ascii', errors='ignore')
        # BUGFIX: the keyword alternatives must be parenthesized.  In the
        # original condition "and" bound tighter than "or", so any hit
        # containing "model"/"detect"/"recog" bypassed both the de-dup
        # check and the 4 < len < 200 filter and was printed repeatedly.
        relevant = ('OneOCR' in full or 'model' in full.lower()
                    or 'detect' in full.lower() or 'recog' in full.lower())
        if full not in found and 4 < len(full) < 200 and relevant:
            found.add(full)
            print(f" 0x{s:08x}: {full}")
|
| 153 |
+
|
| 154 |
+
print(f"\n" + "=" * 80)
print("PHASE 5: ORT session creation patterns")
print("=" * 80)

# ONNX Runtime C-API symbols that would reveal how sessions are created.
ort_patterns = [
    b'OrtGetApiBase', b'CreateSession', b'SessionOptions',
    b'CreateSessionFromArray', b'OrtApi', b'InferenceSession',
    b'SessionFromBuffer', b'CreateSessionFromBuffer',
    b'AppendExecutionProvider', b'ModelMetadata',
]

for pattern in ort_patterns:
    idx = 0
    while True:
        idx = dll.find(pattern, idx)
        if idx < 0:
            break
        # Expand the hit to the full printable-ASCII run around it.
        s = idx
        while s > 0 and 0x20 <= dll[s-1] < 0x7f:
            s -= 1
        e = idx + len(pattern)
        while e < len(dll) and 0x20 <= dll[e] < 0x7f:
            e += 1
        full = dll[s:e].decode('ascii', errors='ignore')
        print(f" 0x{s:08x}: {full[:200]}")
        idx = e  # continue searching after this run

print(f"\n" + "=" * 80)
print("DONE")
print("=" * 80)
|
_archive/attempts/extract_onnx.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Extract valid ONNX models from BCryptDecrypt dumps.
|
| 2 |
+
Strips 8-byte container header and trailing garbage bytes.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import struct
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
DUMP_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
|
| 9 |
+
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\onnx_models")
|
| 10 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 11 |
+
|
| 12 |
+
CONTAINER_HEADER = bytes.fromhex("4a1a082b25000000")
|
| 13 |
+
HEADER_LEN = 8
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def read_varint(data, pos):
|
| 17 |
+
"""Read a protobuf varint. Returns (value, new_pos)."""
|
| 18 |
+
val = 0
|
| 19 |
+
shift = 0
|
| 20 |
+
while pos < len(data):
|
| 21 |
+
b = data[pos]; pos += 1
|
| 22 |
+
val |= (b & 0x7f) << shift
|
| 23 |
+
if not (b & 0x80):
|
| 24 |
+
break
|
| 25 |
+
shift += 7
|
| 26 |
+
return val, pos
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def measure_protobuf(data):
|
| 30 |
+
"""Walk through protobuf fields and return the byte length of valid data.
|
| 31 |
+
Stops at the first unknown/invalid field for ONNX ModelProto.
|
| 32 |
+
Valid fields: 1-9, 14, 20."""
|
| 33 |
+
VALID_FIELDS = {1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 20}
|
| 34 |
+
pos = 0
|
| 35 |
+
last_valid = 0
|
| 36 |
+
|
| 37 |
+
while pos < len(data):
|
| 38 |
+
start = pos
|
| 39 |
+
# Read tag
|
| 40 |
+
tag, pos = read_varint(data, pos)
|
| 41 |
+
if pos > len(data):
|
| 42 |
+
break
|
| 43 |
+
field_num = tag >> 3
|
| 44 |
+
wire_type = tag & 7
|
| 45 |
+
|
| 46 |
+
if field_num not in VALID_FIELDS:
|
| 47 |
+
return start
|
| 48 |
+
|
| 49 |
+
if wire_type == 0: # VARINT
|
| 50 |
+
_, pos = read_varint(data, pos)
|
| 51 |
+
elif wire_type == 1: # I64
|
| 52 |
+
pos += 8
|
| 53 |
+
elif wire_type == 2: # LEN
|
| 54 |
+
length, pos = read_varint(data, pos)
|
| 55 |
+
pos += length
|
| 56 |
+
elif wire_type == 5: # I32
|
| 57 |
+
pos += 4
|
| 58 |
+
else:
|
| 59 |
+
return start
|
| 60 |
+
|
| 61 |
+
if pos > len(data):
|
| 62 |
+
return start
|
| 63 |
+
last_valid = pos
|
| 64 |
+
|
| 65 |
+
return last_valid
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def try_onnx_load(filepath):
|
| 69 |
+
try:
|
| 70 |
+
import onnx
|
| 71 |
+
model = onnx.load(str(filepath))
|
| 72 |
+
return {
|
| 73 |
+
'ir_version': model.ir_version,
|
| 74 |
+
'producer': model.producer_name,
|
| 75 |
+
'producer_version': model.producer_version,
|
| 76 |
+
'opset': [f"{o.domain or 'ai.onnx'}:{o.version}" for o in model.opset_import],
|
| 77 |
+
'graph_name': model.graph.name if model.graph else None,
|
| 78 |
+
'num_nodes': len(model.graph.node) if model.graph else 0,
|
| 79 |
+
'num_inputs': len(model.graph.input) if model.graph else 0,
|
| 80 |
+
'num_outputs': len(model.graph.output) if model.graph else 0,
|
| 81 |
+
'node_types': sorted(set(n.op_type for n in model.graph.node)) if model.graph else [],
|
| 82 |
+
}
|
| 83 |
+
except Exception as e:
|
| 84 |
+
return {'error': str(e)[:200]}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def try_ort_load(filepath):
|
| 88 |
+
try:
|
| 89 |
+
import onnxruntime as ort
|
| 90 |
+
sess = ort.InferenceSession(str(filepath), providers=['CPUExecutionProvider'])
|
| 91 |
+
return {
|
| 92 |
+
'inputs': [(i.name, i.shape, i.type) for i in sess.get_inputs()],
|
| 93 |
+
'outputs': [(o.name, o.shape, o.type) for o in sess.get_outputs()],
|
| 94 |
+
}
|
| 95 |
+
except Exception as e:
|
| 96 |
+
return {'error': str(e)[:200]}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
print("=" * 70)
|
| 100 |
+
print("EXTRACTING ONNX MODELS (WITH TRAILING GARBAGE REMOVAL)")
|
| 101 |
+
print("=" * 70)
|
| 102 |
+
|
| 103 |
+
# Clean output dir
|
| 104 |
+
for old in OUTPUT_DIR.glob("*.onnx"):
|
| 105 |
+
old.unlink()
|
| 106 |
+
|
| 107 |
+
files = sorted(DUMP_DIR.glob("decrypt_*.bin"), key=lambda f: f.stat().st_size, reverse=True)
|
| 108 |
+
print(f"Total decrypt files: {len(files)}\n")
|
| 109 |
+
|
| 110 |
+
models = []
|
| 111 |
+
non_models = []
|
| 112 |
+
|
| 113 |
+
for f in files:
|
| 114 |
+
raw = f.read_bytes()
|
| 115 |
+
|
| 116 |
+
# Strip container header
|
| 117 |
+
if raw[:HEADER_LEN] == CONTAINER_HEADER:
|
| 118 |
+
data = raw[HEADER_LEN:]
|
| 119 |
+
elif raw[:5] == CONTAINER_HEADER[:5]:
|
| 120 |
+
data = raw[HEADER_LEN:]
|
| 121 |
+
else:
|
| 122 |
+
non_models.append({'src': f.name, 'size': len(raw), 'reason': 'no container header',
|
| 123 |
+
'first_16': raw[:16].hex()})
|
| 124 |
+
continue
|
| 125 |
+
|
| 126 |
+
# Check if data starts with valid ONNX (field 1 = ir_version, varint)
|
| 127 |
+
if len(data) < 2 or data[0] != 0x08 or data[1] < 1 or data[1] > 12:
|
| 128 |
+
preview = data[:40].decode('utf-8', errors='replace')
|
| 129 |
+
non_models.append({'src': f.name, 'size': len(raw), 'reason': 'not ONNX',
|
| 130 |
+
'preview': preview})
|
| 131 |
+
continue
|
| 132 |
+
|
| 133 |
+
# Measure valid protobuf length (strip trailing garbage)
|
| 134 |
+
valid_len = measure_protobuf(data)
|
| 135 |
+
trimmed = len(data) - valid_len
|
| 136 |
+
onnx_data = data[:valid_len]
|
| 137 |
+
|
| 138 |
+
# Determine producer
|
| 139 |
+
producer = "unknown"
|
| 140 |
+
if b"PyTorch" in data[:100]:
|
| 141 |
+
producer = "pytorch"
|
| 142 |
+
elif b"onnx.quantize" in data[:100]:
|
| 143 |
+
producer = "onnx_quantize"
|
| 144 |
+
elif b"pytorch" in data[:100]:
|
| 145 |
+
producer = "pytorch_small"
|
| 146 |
+
|
| 147 |
+
ir_version = data[1]
|
| 148 |
+
|
| 149 |
+
idx = len(models)
|
| 150 |
+
fname = f"model_{idx:02d}_ir{ir_version}_{producer}_{valid_len//1024}KB.onnx"
|
| 151 |
+
outpath = OUTPUT_DIR / fname
|
| 152 |
+
outpath.write_bytes(onnx_data)
|
| 153 |
+
|
| 154 |
+
models.append({
|
| 155 |
+
'src': f.name, 'dst': fname, 'raw_size': len(raw),
|
| 156 |
+
'onnx_size': valid_len, 'trimmed': trimmed,
|
| 157 |
+
'ir_version': ir_version, 'producer': producer,
|
| 158 |
+
})
|
| 159 |
+
|
| 160 |
+
print(f"ONNX models extracted: {len(models)}")
|
| 161 |
+
print(f"Non-model files: {len(non_models)}")
|
| 162 |
+
|
| 163 |
+
# Verify all models
|
| 164 |
+
print("\n" + "=" * 70)
|
| 165 |
+
print("VERIFICATION WITH onnx + onnxruntime")
|
| 166 |
+
print("=" * 70)
|
| 167 |
+
|
| 168 |
+
verified_onnx = 0
|
| 169 |
+
verified_ort = 0
|
| 170 |
+
|
| 171 |
+
for m in models:
|
| 172 |
+
outpath = OUTPUT_DIR / m['dst']
|
| 173 |
+
|
| 174 |
+
r_onnx = try_onnx_load(outpath)
|
| 175 |
+
r_ort = try_ort_load(outpath)
|
| 176 |
+
|
| 177 |
+
onnx_ok = 'error' not in r_onnx
|
| 178 |
+
ort_ok = 'error' not in r_ort
|
| 179 |
+
|
| 180 |
+
if onnx_ok:
|
| 181 |
+
verified_onnx += 1
|
| 182 |
+
if ort_ok:
|
| 183 |
+
verified_ort += 1
|
| 184 |
+
|
| 185 |
+
status = "OK" if onnx_ok and ort_ok else ("onnx" if onnx_ok else ("ort" if ort_ok else "FAIL"))
|
| 186 |
+
|
| 187 |
+
print(f"\n [{status:>4}] {m['dst']}")
|
| 188 |
+
print(f" Raw: {m['raw_size']:>10,} -> ONNX: {m['onnx_size']:>10,} (trimmed {m['trimmed']} bytes)")
|
| 189 |
+
|
| 190 |
+
if onnx_ok:
|
| 191 |
+
r = r_onnx
|
| 192 |
+
print(f" graph='{r['graph_name']}', nodes={r['num_nodes']}, "
|
| 193 |
+
f"inputs={r['num_inputs']}, outputs={r['num_outputs']}")
|
| 194 |
+
print(f" opset: {', '.join(r['opset'][:5])}")
|
| 195 |
+
ops = r['node_types']
|
| 196 |
+
print(f" ops({len(ops)}): {', '.join(ops[:15])}")
|
| 197 |
+
if len(ops) > 15:
|
| 198 |
+
print(f" ... +{len(ops)-15} more")
|
| 199 |
+
elif ort_ok:
|
| 200 |
+
r = r_ort
|
| 201 |
+
for inp in r['inputs']:
|
| 202 |
+
print(f" Input: {inp[0]} {inp[1]} {inp[2]}")
|
| 203 |
+
for out in r['outputs']:
|
| 204 |
+
print(f" Output: {out[0]} {out[1]} {out[2]}")
|
| 205 |
+
else:
|
| 206 |
+
print(f" onnx: {r_onnx.get('error', '')[:100]}")
|
| 207 |
+
print(f" ort: {r_ort.get('error', '')[:100]}")
|
| 208 |
+
|
| 209 |
+
# Summary
|
| 210 |
+
print("\n" + "=" * 70)
|
| 211 |
+
print("FINAL SUMMARY")
|
| 212 |
+
print("=" * 70)
|
| 213 |
+
print(f"Decrypted dumps: {len(files)}")
|
| 214 |
+
print(f"ONNX models: {len(models)}")
|
| 215 |
+
print(f" - onnx.load OK: {verified_onnx}")
|
| 216 |
+
print(f" - onnxruntime OK: {verified_ort}")
|
| 217 |
+
print(f"Non-model data: {len(non_models)}")
|
| 218 |
+
|
| 219 |
+
if models:
|
| 220 |
+
total = sum(m['onnx_size'] for m in models)
|
| 221 |
+
print(f"\nTotal ONNX model size: {total:,} bytes ({total/1024/1024:.1f} MB)")
|
| 222 |
+
|
| 223 |
+
print(f"\nNon-model content:")
|
| 224 |
+
for nm in non_models[:15]:
|
| 225 |
+
desc = nm.get('preview', nm.get('first_16', ''))[:50]
|
| 226 |
+
print(f" {nm['src']}: {nm['size']:>10,} bytes | {nm['reason']} | {desc!r}")
|
| 227 |
+
|
| 228 |
+
print(f"\n{'='*70}")
|
| 229 |
+
print(f"CRYPTO PARAMS (CONFIRMED)")
|
| 230 |
+
print(f"{'='*70}")
|
| 231 |
+
print(f'Key: kj)TGtrK>f]b[Piow.gU+nC@s""""""4 (32 bytes, raw)')
|
| 232 |
+
print(f'IV: Copyright @ OneO (16 bytes)')
|
| 233 |
+
print(f"Mode: AES-256-CFB (full block, BCrypt CNG)")
|
| 234 |
+
print(f"Container: 8-byte header 4a1a082b25000000 per chunk")
|
| 235 |
+
print(f"Model: ONNX protobuf + trailing metadata (trimmed)")
|
_archive/attempts/extract_strings.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
data = open(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll", "rb").read()
|
| 4 |
+
|
| 5 |
+
all_strings = re.findall(b'[\x20-\x7e]{6,}', data)
|
| 6 |
+
crypto_keywords = [b'crypt', b'aes', b'bcrypt', b'key', b'iv', b'cipher', b'cfb', b'hash',
|
| 7 |
+
b'sha', b'magic', b'decomp', b'uncomp', b'compress', b'model', b'meta',
|
| 8 |
+
b'onnx', b'ONNX', b'decrypt', b'encrypt', b'Crypto', b'init', b'blob',
|
| 9 |
+
b'MAGIC', b'check', b'Check', b'fail', b'Fail', b'number']
|
| 10 |
+
|
| 11 |
+
print(f"Total strings: {len(all_strings)}")
|
| 12 |
+
print()
|
| 13 |
+
print("=== Crypto/model-related strings ===")
|
| 14 |
+
seen = set()
|
| 15 |
+
for s in all_strings:
|
| 16 |
+
s_lower = s.lower()
|
| 17 |
+
for kw in crypto_keywords:
|
| 18 |
+
if kw.lower() in s_lower:
|
| 19 |
+
if s not in seen:
|
| 20 |
+
seen.add(s)
|
| 21 |
+
offset = data.find(s)
|
| 22 |
+
text = s.decode("ascii", errors="replace")
|
| 23 |
+
print(f" [0x{offset:08x}] {text}")
|
| 24 |
+
break
|
| 25 |
+
|
| 26 |
+
# Also look for wide strings (UTF-16LE) related to BCrypt
|
| 27 |
+
print()
|
| 28 |
+
print("=== Wide (UTF-16LE) strings ===")
|
| 29 |
+
wide_strings = re.findall(b'(?:[\x20-\x7e]\x00){4,}', data)
|
| 30 |
+
for ws in wide_strings:
|
| 31 |
+
decoded = ws.decode("utf-16-le", errors="replace")
|
| 32 |
+
d_lower = decoded.lower()
|
| 33 |
+
for kw in [b'crypt', b'aes', b'cfb', b'chain', b'algorithm', b'key', b'sha', b'hash']:
|
| 34 |
+
if kw.decode().lower() in d_lower:
|
| 35 |
+
offset = data.find(ws)
|
| 36 |
+
print(f" [0x{offset:08x}] {decoded}")
|
| 37 |
+
break
|
_archive/attempts/find_offset.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Map encrypted input bytes from hook to file offsets."""
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import struct
|
| 4 |
+
|
| 5 |
+
data = Path("ocr_data/oneocr.onemodel").read_bytes()
|
| 6 |
+
|
| 7 |
+
# Encrypted input first bytes from hook (call #, first 32 enc bytes hex, chunk_size)
|
| 8 |
+
chunks_encrypted = [
|
| 9 |
+
(0, "2e0c10c7c967f66b6d03821271115ad6c19ca7d91b668e5c484018e02c9632b4", 22624),
|
| 10 |
+
(2, "f7d14a6dbd04af02b6de5e5454af59d007bb5c174e3b6be6a73513b995c7dc1a", 11920),
|
| 11 |
+
(4, "7bf021af201c559217035b95ebf758ff70c860f126c9c1529421bb2d75898bf9", 11553680),
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
print("Searching for encrypted chunk starts in file:")
|
| 15 |
+
print(f"File size: {len(data):,}")
|
| 16 |
+
print()
|
| 17 |
+
|
| 18 |
+
prev_end = 0
|
| 19 |
+
for call_num, hex_str, chunk_size in chunks_encrypted:
|
| 20 |
+
search_bytes = bytes.fromhex(hex_str[:16]) # First 8 bytes
|
| 21 |
+
idx = data.find(search_bytes)
|
| 22 |
+
if idx >= 0:
|
| 23 |
+
gap = idx - prev_end if prev_end > 0 else idx
|
| 24 |
+
print(f" Call #{call_num}: offset {idx} ({idx:#x}), size={chunk_size:,}, gap={gap}")
|
| 25 |
+
print(f" Range: [{idx:#x}, {idx+chunk_size:#x})")
|
| 26 |
+
prev_end = idx + chunk_size
|
| 27 |
+
|
| 28 |
+
full = bytes.fromhex(hex_str)
|
| 29 |
+
if data[idx:idx+len(full)] == full:
|
| 30 |
+
print(f" 32-byte match: OK")
|
| 31 |
+
else:
|
| 32 |
+
print(f" Call #{call_num}: NOT FOUND")
|
| 33 |
+
|
| 34 |
+
# File structure
|
| 35 |
+
print(f"\n--- File structure ---")
|
| 36 |
+
print(f"Offset 0: header_size = {struct.unpack_from('<I', data, 0)[0]}")
|
| 37 |
+
print(f"Offset 4: {struct.unpack_from('<I', data, 4)[0]}")
|
| 38 |
+
print(f"Offset 8-23: {data[8:24].hex()}")
|
| 39 |
+
|
| 40 |
+
chunk1_end = 24 + 22624 # = 22648
|
| 41 |
+
print(f"\nChunk 1 ends at offset {chunk1_end}")
|
| 42 |
+
for o in range(22636, 22680, 4):
|
| 43 |
+
v = struct.unpack_from('<I', data, o)[0]
|
| 44 |
+
print(f" offset {o}: {data[o:o+4].hex()} = uint32 {v} ({v:#x})")
|
_archive/attempts/frida_hook.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Frida-based hooking of BCryptDecrypt in oneocr.dll to intercept decrypted ONNX models.
|
| 3 |
+
|
| 4 |
+
Strategy:
|
| 5 |
+
1. Load oneocr.dll in a child process
|
| 6 |
+
2. Hook BCryptDecrypt in bcrypt.dll to capture decrypted output
|
| 7 |
+
3. Call CreateOcrPipeline which triggers model decryption
|
| 8 |
+
4. Save all decrypted buffers
|
| 9 |
+
"""
|
| 10 |
+
import frida
|
| 11 |
+
import sys
|
| 12 |
+
import os
|
| 13 |
+
import struct
|
| 14 |
+
import time
|
| 15 |
+
import json
|
| 16 |
+
import ctypes
|
| 17 |
+
import subprocess
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
|
| 21 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 22 |
+
|
| 23 |
+
# JavaScript hook script for Frida
|
| 24 |
+
FRIDA_SCRIPT = """
|
| 25 |
+
'use strict';
|
| 26 |
+
|
| 27 |
+
var MIN_SIZE = 100;
|
| 28 |
+
var decryptCallNum = 0;
|
| 29 |
+
|
| 30 |
+
// Hook BCryptDecrypt
|
| 31 |
+
var bcryptDecrypt = Module.findExportByName('bcrypt.dll', 'BCryptDecrypt');
|
| 32 |
+
if (bcryptDecrypt) {
|
| 33 |
+
Interceptor.attach(bcryptDecrypt, {
|
| 34 |
+
onEnter: function(args) {
|
| 35 |
+
this.pbInput = args[1];
|
| 36 |
+
this.cbInput = args[2].toInt32();
|
| 37 |
+
this.pbIV = args[4];
|
| 38 |
+
this.cbIV = args[5].toInt32();
|
| 39 |
+
this.pbOutput = args[6];
|
| 40 |
+
this.cbOutput = args[7].toInt32();
|
| 41 |
+
this.pcbResult = args[8];
|
| 42 |
+
this.dwFlags = args[9].toInt32();
|
| 43 |
+
this.callNum = decryptCallNum++;
|
| 44 |
+
},
|
| 45 |
+
onLeave: function(retval) {
|
| 46 |
+
var status = retval.toInt32();
|
| 47 |
+
var cbResult = 0;
|
| 48 |
+
try {
|
| 49 |
+
if (!this.pcbResult.isNull()) {
|
| 50 |
+
cbResult = this.pcbResult.readU32();
|
| 51 |
+
}
|
| 52 |
+
} catch(e) {}
|
| 53 |
+
|
| 54 |
+
var info = {
|
| 55 |
+
call: this.callNum,
|
| 56 |
+
status: status,
|
| 57 |
+
inputSize: this.cbInput,
|
| 58 |
+
ivSize: this.cbIV,
|
| 59 |
+
outputSize: cbResult,
|
| 60 |
+
flags: this.dwFlags
|
| 61 |
+
};
|
| 62 |
+
|
| 63 |
+
if (this.cbIV > 0 && !this.pbIV.isNull()) {
|
| 64 |
+
try {
|
| 65 |
+
info.iv = [];
|
| 66 |
+
var ivBuf = this.pbIV.readByteArray(this.cbIV);
|
| 67 |
+
var ivArr = new Uint8Array(ivBuf);
|
| 68 |
+
for (var k = 0; k < ivArr.length; k++) info.iv.push(ivArr[k]);
|
| 69 |
+
} catch(e) {}
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
send({type: 'decrypt_call', info: info});
|
| 73 |
+
|
| 74 |
+
if (status === 0 && cbResult >= MIN_SIZE && !this.pbOutput.isNull()) {
|
| 75 |
+
try {
|
| 76 |
+
var data = this.pbOutput.readByteArray(cbResult);
|
| 77 |
+
send({type: 'decrypt_data', call: this.callNum, size: cbResult}, data);
|
| 78 |
+
} catch(e) {
|
| 79 |
+
send({type: 'log', msg: 'Read output failed: ' + e});
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
});
|
| 84 |
+
send({type: 'log', msg: 'Hooked BCryptDecrypt at ' + bcryptDecrypt});
|
| 85 |
+
} else {
|
| 86 |
+
send({type: 'log', msg: 'ERROR: BCryptDecrypt not found'});
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
// Hook BCryptGenerateSymmetricKey
|
| 90 |
+
var bcryptGenKey = Module.findExportByName('bcrypt.dll', 'BCryptGenerateSymmetricKey');
|
| 91 |
+
if (bcryptGenKey) {
|
| 92 |
+
Interceptor.attach(bcryptGenKey, {
|
| 93 |
+
onEnter: function(args) {
|
| 94 |
+
this.pbSecret = args[3];
|
| 95 |
+
this.cbSecret = args[4].toInt32();
|
| 96 |
+
},
|
| 97 |
+
onLeave: function(retval) {
|
| 98 |
+
if (retval.toInt32() === 0 && this.cbSecret > 0) {
|
| 99 |
+
try {
|
| 100 |
+
var keyBuf = this.pbSecret.readByteArray(this.cbSecret);
|
| 101 |
+
var keyArr = new Uint8Array(keyBuf);
|
| 102 |
+
var arr = [];
|
| 103 |
+
for (var i = 0; i < keyArr.length; i++) arr.push(keyArr[i]);
|
| 104 |
+
send({type: 'key_generated', size: this.cbSecret, key: arr});
|
| 105 |
+
} catch(e) {}
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
});
|
| 109 |
+
send({type: 'log', msg: 'Hooked BCryptGenerateSymmetricKey'});
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
// Hook BCryptSetProperty
|
| 113 |
+
var bcryptSetProp = Module.findExportByName('bcrypt.dll', 'BCryptSetProperty');
|
| 114 |
+
if (bcryptSetProp) {
|
| 115 |
+
Interceptor.attach(bcryptSetProp, {
|
| 116 |
+
onEnter: function(args) {
|
| 117 |
+
try {
|
| 118 |
+
var propName = args[1].readUtf16String();
|
| 119 |
+
var cbInput = args[3].toInt32();
|
| 120 |
+
var propValue = null;
|
| 121 |
+
if (cbInput > 0 && cbInput < 256 && !args[2].isNull()) {
|
| 122 |
+
try { propValue = args[2].readUtf16String(); } catch(e2) {}
|
| 123 |
+
}
|
| 124 |
+
send({type: 'set_property', name: propName, value: propValue, size: cbInput});
|
| 125 |
+
} catch(e) {}
|
| 126 |
+
}
|
| 127 |
+
});
|
| 128 |
+
send({type: 'log', msg: 'Hooked BCryptSetProperty'});
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
send({type: 'log', msg: 'All hooks installed. Ready.'});
|
| 132 |
+
"""
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def create_loader_script():
|
| 136 |
+
"""Create a small Python script that loads oneocr.dll and creates a pipeline."""
|
| 137 |
+
script = r'''
|
| 138 |
+
import ctypes
|
| 139 |
+
from ctypes import c_int64, c_char_p, POINTER, byref
|
| 140 |
+
import time
|
| 141 |
+
import sys
|
| 142 |
+
import os
|
| 143 |
+
|
| 144 |
+
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
|
| 145 |
+
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
|
| 146 |
+
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 147 |
+
|
| 148 |
+
# Load DLLs
|
| 149 |
+
kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
|
| 150 |
+
kernel32.SetDllDirectoryW(DLL_DIR)
|
| 151 |
+
dll = ctypes.WinDLL(os.path.join(DLL_DIR, "oneocr.dll"))
|
| 152 |
+
|
| 153 |
+
# Setup function types
|
| 154 |
+
dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
|
| 155 |
+
dll.CreateOcrInitOptions.restype = c_int64
|
| 156 |
+
dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, ctypes.c_char]
|
| 157 |
+
dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
|
| 158 |
+
dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
|
| 159 |
+
dll.CreateOcrPipeline.restype = c_int64
|
| 160 |
+
|
| 161 |
+
# Create init options
|
| 162 |
+
init_options = c_int64()
|
| 163 |
+
ret = dll.CreateOcrInitOptions(byref(init_options))
|
| 164 |
+
print(f"LOADER: CreateOcrInitOptions -> {ret}", flush=True)
|
| 165 |
+
assert ret == 0
|
| 166 |
+
|
| 167 |
+
ret = dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)
|
| 168 |
+
print(f"LOADER: SetUseModelDelayLoad -> {ret}", flush=True)
|
| 169 |
+
assert ret == 0
|
| 170 |
+
|
| 171 |
+
# Create pipeline (this triggers decryption!)
|
| 172 |
+
pipeline = c_int64()
|
| 173 |
+
model_buf = ctypes.create_string_buffer(MODEL_PATH.encode())
|
| 174 |
+
key_buf = ctypes.create_string_buffer(KEY)
|
| 175 |
+
|
| 176 |
+
print("LOADER: Creating OCR pipeline (triggers decryption)...", flush=True)
|
| 177 |
+
ret = dll.CreateOcrPipeline(model_buf, key_buf, init_options, byref(pipeline))
|
| 178 |
+
print(f"LOADER: CreateOcrPipeline returned {ret}, pipeline={pipeline.value}", flush=True)
|
| 179 |
+
|
| 180 |
+
if ret != 0:
|
| 181 |
+
print(f"LOADER: ERROR - return code {ret}", flush=True)
|
| 182 |
+
sys.exit(1)
|
| 183 |
+
|
| 184 |
+
print("LOADER: Pipeline created successfully! Waiting...", flush=True)
|
| 185 |
+
time.sleep(5)
|
| 186 |
+
print("LOADER: Done.", flush=True)
|
| 187 |
+
'''
|
| 188 |
+
loader_path = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_loader.py")
|
| 189 |
+
loader_path.write_text(script)
|
| 190 |
+
return loader_path
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def on_message(message, data):
|
| 194 |
+
"""Handle messages from Frida script."""
|
| 195 |
+
if message['type'] == 'send':
|
| 196 |
+
payload = message['payload']
|
| 197 |
+
msg_type = payload.get('type', '')
|
| 198 |
+
|
| 199 |
+
if msg_type == 'log':
|
| 200 |
+
print(f"[FRIDA] {payload['msg']}")
|
| 201 |
+
|
| 202 |
+
elif msg_type == 'decrypt_call':
|
| 203 |
+
info = payload['info']
|
| 204 |
+
iv_hex = ''
|
| 205 |
+
if 'iv' in info:
|
| 206 |
+
iv_hex = bytes(info['iv']).hex()
|
| 207 |
+
print(f"[DECRYPT #{info['call']}] status={info['status']} "
|
| 208 |
+
f"in={info['inputSize']} out={info['outputSize']} "
|
| 209 |
+
f"iv_size={info['ivSize']} iv={iv_hex[:32]}... flags={info['flags']}")
|
| 210 |
+
|
| 211 |
+
elif msg_type == 'decrypt_data':
|
| 212 |
+
call_num = payload['call']
|
| 213 |
+
size = payload['size']
|
| 214 |
+
fname = OUTPUT_DIR / f"decrypt_{call_num}_{size}bytes.bin"
|
| 215 |
+
fname.write_bytes(data)
|
| 216 |
+
|
| 217 |
+
# Check first 4 bytes for magic number
|
| 218 |
+
magic = struct.unpack('<I', data[:4])[0] if len(data) >= 4 else 0
|
| 219 |
+
first_16 = data[:16].hex() if data else ''
|
| 220 |
+
print(f" -> Saved {fname.name} | magic={magic} | first_16={first_16}")
|
| 221 |
+
|
| 222 |
+
if magic == 1:
|
| 223 |
+
print(f" *** MAGIC NUMBER == 1 FOUND! This is the decrypted model container! ***")
|
| 224 |
+
|
| 225 |
+
elif msg_type == 'key_generated':
|
| 226 |
+
key_bytes = bytes(payload['key'])
|
| 227 |
+
print(f"[KEY] size={payload['size']} key={key_bytes}")
|
| 228 |
+
try:
|
| 229 |
+
print(f" ASCII: {key_bytes.decode('ascii', errors='replace')}")
|
| 230 |
+
except:
|
| 231 |
+
pass
|
| 232 |
+
|
| 233 |
+
elif msg_type == 'set_property':
|
| 234 |
+
print(f"[PROPERTY] {payload['name']} = {payload['value']} (size={payload['size']})")
|
| 235 |
+
|
| 236 |
+
elif msg_type == 'uncompress':
|
| 237 |
+
print(f"[UNCOMPRESS] sourceLen={payload['sourceLen']} -> destLen={payload['destLen']}")
|
| 238 |
+
|
| 239 |
+
elif msg_type == 'uncompress_data':
|
| 240 |
+
size = payload['size']
|
| 241 |
+
fname = OUTPUT_DIR / f"uncompressed_{size}bytes.bin"
|
| 242 |
+
fname.write_bytes(data)
|
| 243 |
+
first_32 = data[:32].hex() if data else ''
|
| 244 |
+
print(f" -> Saved {fname.name} | first_32={first_32}")
|
| 245 |
+
|
| 246 |
+
elif msg_type == 'ort_export':
|
| 247 |
+
print(f"[ORT] {payload['name']} @ {payload['addr']}")
|
| 248 |
+
|
| 249 |
+
else:
|
| 250 |
+
print(f"[MSG] {payload}")
|
| 251 |
+
|
| 252 |
+
elif message['type'] == 'error':
|
| 253 |
+
print(f"[FRIDA ERROR] {message['description']}")
|
| 254 |
+
if 'stack' in message:
|
| 255 |
+
print(message['stack'])
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def main():
|
| 259 |
+
print("=" * 70)
|
| 260 |
+
print("FRIDA HOOKING: Intercepting OneOCR model decryption")
|
| 261 |
+
print("=" * 70)
|
| 262 |
+
|
| 263 |
+
# Create loader script
|
| 264 |
+
loader_path = create_loader_script()
|
| 265 |
+
print(f"Loader script: {loader_path}")
|
| 266 |
+
|
| 267 |
+
# Find Python executable in venv
|
| 268 |
+
venv_python = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\.venv\Scripts\python.exe")
|
| 269 |
+
if not venv_python.exists():
|
| 270 |
+
print("ERROR: Python venv not found")
|
| 271 |
+
sys.exit(1)
|
| 272 |
+
|
| 273 |
+
# Spawn the loader process
|
| 274 |
+
print(f"Spawning: {venv_python} {loader_path}")
|
| 275 |
+
pid = frida.spawn([str(venv_python), str(loader_path)])
|
| 276 |
+
print(f"Process spawned, PID={pid}")
|
| 277 |
+
|
| 278 |
+
session = frida.attach(pid)
|
| 279 |
+
print("Attached to process")
|
| 280 |
+
|
| 281 |
+
script = session.create_script(FRIDA_SCRIPT)
|
| 282 |
+
script.on('message', on_message)
|
| 283 |
+
script.load()
|
| 284 |
+
print("Script loaded, resuming process...")
|
| 285 |
+
|
| 286 |
+
frida.resume(pid)
|
| 287 |
+
|
| 288 |
+
# Wait for the process to finish
|
| 289 |
+
print("Waiting for process to complete...")
|
| 290 |
+
try:
|
| 291 |
+
# Wait up to 60 seconds
|
| 292 |
+
for _ in range(120):
|
| 293 |
+
time.sleep(0.5)
|
| 294 |
+
try:
|
| 295 |
+
# Check if process is still alive
|
| 296 |
+
session.is_detached
|
| 297 |
+
except:
|
| 298 |
+
break
|
| 299 |
+
except KeyboardInterrupt:
|
| 300 |
+
print("\nInterrupted by user")
|
| 301 |
+
except frida.InvalidOperationError:
|
| 302 |
+
print("Process terminated")
|
| 303 |
+
|
| 304 |
+
# Summary
|
| 305 |
+
print()
|
| 306 |
+
print("=" * 70)
|
| 307 |
+
print("RESULTS")
|
| 308 |
+
print("=" * 70)
|
| 309 |
+
|
| 310 |
+
if OUTPUT_DIR.exists():
|
| 311 |
+
files = sorted(OUTPUT_DIR.iterdir())
|
| 312 |
+
if files:
|
| 313 |
+
print(f"Dumped {len(files)} files:")
|
| 314 |
+
for f in files:
|
| 315 |
+
size = f.stat().st_size
|
| 316 |
+
print(f" {f.name}: {size:,} bytes")
|
| 317 |
+
if size >= 4:
|
| 318 |
+
header = open(f, 'rb').read(16)
|
| 319 |
+
magic = struct.unpack('<I', header[:4])[0]
|
| 320 |
+
print(f" magic={magic}, first_16={header.hex()}")
|
| 321 |
+
else:
|
| 322 |
+
print("No files dumped.")
|
| 323 |
+
|
| 324 |
+
print("\nDone!")
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
if __name__ == '__main__':
|
| 328 |
+
main()
|
_archive/attempts/frida_loader.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import ctypes
|
| 3 |
+
from ctypes import c_int64, c_char_p, POINTER, byref
|
| 4 |
+
import time
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
|
| 9 |
+
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
|
| 10 |
+
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 11 |
+
|
| 12 |
+
# Load DLLs
|
| 13 |
+
kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
|
| 14 |
+
kernel32.SetDllDirectoryW(DLL_DIR)
|
| 15 |
+
dll = ctypes.WinDLL(os.path.join(DLL_DIR, "oneocr.dll"))
|
| 16 |
+
|
| 17 |
+
# Setup function types
|
| 18 |
+
dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
|
| 19 |
+
dll.CreateOcrInitOptions.restype = c_int64
|
| 20 |
+
dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, ctypes.c_char]
|
| 21 |
+
dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
|
| 22 |
+
dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
|
| 23 |
+
dll.CreateOcrPipeline.restype = c_int64
|
| 24 |
+
|
| 25 |
+
# Create init options
|
| 26 |
+
init_options = c_int64()
|
| 27 |
+
ret = dll.CreateOcrInitOptions(byref(init_options))
|
| 28 |
+
print(f"LOADER: CreateOcrInitOptions -> {ret}", flush=True)
|
| 29 |
+
assert ret == 0
|
| 30 |
+
|
| 31 |
+
ret = dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)
|
| 32 |
+
print(f"LOADER: SetUseModelDelayLoad -> {ret}", flush=True)
|
| 33 |
+
assert ret == 0
|
| 34 |
+
|
| 35 |
+
# Create pipeline (this triggers decryption!)
|
| 36 |
+
pipeline = c_int64()
|
| 37 |
+
model_buf = ctypes.create_string_buffer(MODEL_PATH.encode())
|
| 38 |
+
key_buf = ctypes.create_string_buffer(KEY)
|
| 39 |
+
|
| 40 |
+
print("LOADER: Creating OCR pipeline (triggers decryption)...", flush=True)
|
| 41 |
+
ret = dll.CreateOcrPipeline(model_buf, key_buf, init_options, byref(pipeline))
|
| 42 |
+
print(f"LOADER: CreateOcrPipeline returned {ret}, pipeline={pipeline.value}", flush=True)
|
| 43 |
+
|
| 44 |
+
if ret != 0:
|
| 45 |
+
print(f"LOADER: ERROR - return code {ret}", flush=True)
|
| 46 |
+
sys.exit(1)
|
| 47 |
+
|
| 48 |
+
print("LOADER: Pipeline created successfully! Waiting...", flush=True)
|
| 49 |
+
time.sleep(5)
|
| 50 |
+
print("LOADER: Done.", flush=True)
|
_archive/attempts/peek_header.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Exploratory dump of the oneocr.onemodel container header.

Reads the first ~23 KB of the encrypted container plus the total file size,
then prints hex dumps, little-endian integer views, byte-diversity stats,
candidate offset tables, and 16-byte blocks that might be an AES IV.
Read-only analysis: nothing is written to disk.
"""
import struct

# Hardcoded absolute path to the encrypted model container under analysis.
filepath = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"

with open(filepath, "rb") as f:
    data = f.read(23000) # read a bit more than 22636
    f.seek(0, 2)  # jump to EOF so tell() reports the total file size
    filesize = f.tell()

print(f"File size: {filesize} bytes ({filesize/1024/1024:.2f} MB)")
print()

# Hex dump first 512 bytes
print("=== First 512 bytes hex dump ===")
for i in range(0, 512, 16):
    hex_part = " ".join(f"{b:02x}" for b in data[i:i+16])
    ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in data[i:i+16])
    print(f"{i:08x}: {hex_part:<48s} {ascii_part}")

print()
print("=== uint32 LE values at key offsets ===")
for off in range(0, 64, 4):
    val = struct.unpack_from("<I", data, off)[0]
    print(f" offset {off:4d} (0x{off:04x}): {val:12d} (0x{val:08x})")

print()
print("=== Check around offset 22636 (header size?) ===")
# 22636 is the suspected header/payload boundary; dump the bytes around it.
off = 22636
for i in range(off - 32, off + 64, 16):
    hex_part = " ".join(f"{b:02x}" for b in data[i:i+16])
    ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in data[i:i+16])
    print(f"{i:08x}: {hex_part:<48s} {ascii_part}")

print()
print("=== Entropy analysis of header vs body ===")
# Distinct-byte count is a cheap entropy proxy: encrypted/compressed data
# tends to use close to all 256 byte values.
from collections import Counter
header = data[:22636]
body_sample = data[22636:22636+4096]
h_counter = Counter(header)
b_counter = Counter(body_sample)
print(f" Header unique bytes: {len(h_counter)}/256")
print(f" Body sample unique bytes: {len(b_counter)}/256")

# Check for null bytes in header
null_count = header.count(0)
print(f" Header null bytes: {null_count}/{len(header)} ({100*null_count/len(header):.1f}%)")

# Look for patterns in header
print()
print("=== Looking for potential sub-structures in header ===")
# Check if there are recognizable strings (runs of >= 4 printable ASCII bytes)
import re
strings = re.findall(b'[\x20-\x7e]{4,}', header)
if strings:
    print(" ASCII strings found in header:")
    for s in strings[:30]:
        print(f" {s.decode('ascii', errors='replace')}")
else:
    print(" No ASCII strings >= 4 chars found in header")

# Check for potential magic numbers
print()
print("=== Magic number checks at offset 0 ===")
print(f" Bytes 0-3: {data[0:4].hex()}")
print(f" Bytes 0-7: {data[0:8].hex()}")
print(f" As string: {data[0:8]}")

# Look for repeating 4-byte patterns
print()
print("=== Byte frequency in first 64 bytes ===")
for i in range(64):
    if i % 16 == 0:
        print(f" {i:3d}: ", end="")
    print(f"{data[i]:3d}", end=" ")
    if i % 16 == 15:
        print()

# Check if header has structure - look for uint32 values that could be offsets/sizes
# (any value strictly between 0 and the file size is a plausible offset/size)
print()
print("=== Potential offset/size table at start ===")
for i in range(0, min(256, len(header)), 4):
    val = struct.unpack_from("<I", data, i)[0]
    if 0 < val < filesize:
        print(f" offset {i}: uint32={val} (could be offset/size, {val/1024:.1f}KB)")

# Check byte patterns for IV detection: a random IV should have high
# byte diversity within its 16 bytes.
print()
print("=== 16-byte blocks that could be IV ===")
for start in [4, 8, 12, 16, 20]:
    block = data[start:start+16]
    unique = len(set(block))
    print(f" offset {start:3d}: {block.hex()} (unique bytes: {unique}/16)")
|
_archive/attempts/static_decrypt.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Static decryptor for OneOCR .onemodel files using BCrypt CNG API.
Finds chunk boundaries by re-encrypting known plaintext patterns.
Works on Windows only (BCrypt CNG). For Linux, use the hook-based approach.

Usage: python static_decrypt.py [model_path] [-o output_dir]
"""
import ctypes
import ctypes.wintypes as wt
from ctypes import c_void_p, c_ulong, POINTER, byref
import struct
import sys
import os
from pathlib import Path

# ═══════════════════════════════════════════════════════════════
# CRYPTO PARAMETERS (discovered via IAT hook interception)
# ═══════════════════════════════════════════════════════════════
# 32-byte AES-256 key and 16-byte IV recovered from the hooked process.
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
IV = b"Copyright @ OneO"
# Plaintext magic that prefixes each chunk inside the container.
CONTAINER_HEADER = bytes.fromhex("4a1a082b25000000")
# Top-level field numbers of an ONNX ModelProto protobuf message.
ONNX_VALID_FIELDS = {1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 20}

# BCrypt constants (CNG string identifiers must be NUL-terminated UTF-16-LE)
BCRYPT_AES = "AES\0".encode('utf-16-le')
BCRYPT_CHAINING_MODE = "ChainingMode\0".encode('utf-16-le')
BCRYPT_CHAIN_MODE_CFB = "ChainingModeCFB\0".encode('utf-16-le')

# Handle to bcrypt.dll — Windows-only; this line fails at import elsewhere.
bcrypt = ctypes.windll.bcrypt
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class BCRYPT_KEY_DATA_BLOB_HEADER(ctypes.Structure):
    """Header of a BCRYPT_KEY_DATA_BLOB (bcrypt.h), prepended to raw key bytes.

    Layout matches the Win32 struct: dwMagic must be
    BCRYPT_KEY_DATA_BLOB_MAGIC (0x4d42444b, ASCII 'KDBM' little-endian),
    dwVersion is 1, and cbKeyData is the byte length of the key material
    that immediately follows this header in the blob.
    """
    _fields_ = [
        ("dwMagic", c_ulong),    # BCRYPT_KEY_DATA_BLOB_MAGIC
        ("dwVersion", c_ulong),  # BCRYPT_KEY_DATA_BLOB_VERSION1
        ("cbKeyData", c_ulong),  # length in bytes of the key that follows
    ]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def setup_bcrypt():
    """Open an AES algorithm provider in CFB mode and import the fixed key.

    Returns:
        (hAlg, hKey): BCrypt algorithm-provider and key handles. The caller
        owns both and must release them with BCryptDestroyKey /
        BCryptCloseAlgorithmProvider.

    Raises:
        RuntimeError: if any BCrypt call returns a non-zero NTSTATUS.
        (These checks were previously `assert`s, which are silently
        stripped when Python runs with -O.)
    """
    def _check(status, what):
        # BCrypt APIs return 0 (STATUS_SUCCESS) on success.
        if status != 0:
            raise RuntimeError(f"{what} failed: {status:#x}")

    hAlg = c_void_p()
    _check(bcrypt.BCryptOpenAlgorithmProvider(byref(hAlg), BCRYPT_AES, None, 0),
           "BCryptOpenAlgorithmProvider")
    _check(bcrypt.BCryptSetProperty(hAlg, BCRYPT_CHAINING_MODE,
                                    BCRYPT_CHAIN_MODE_CFB, len(BCRYPT_CHAIN_MODE_CFB), 0),
           "BCryptSetProperty(ChainingMode)")
    # Key blob = BCRYPT_KEY_DATA_BLOB_HEADER ('KDBM' magic, version 1) + raw key.
    header = BCRYPT_KEY_DATA_BLOB_HEADER(dwMagic=0x4d42444b, dwVersion=1, cbKeyData=len(KEY))
    blob = bytes(header) + KEY
    hKey = c_void_p()
    _check(bcrypt.BCryptGenerateSymmetricKey(hAlg, byref(hKey), None, 0, blob, len(blob), 0),
           "BCryptGenerateSymmetricKey")
    return hAlg, hKey
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def bcrypt_op(hKey, data, encrypt=False):
    """Encrypt or decrypt *data* with AES-CFB, using a fresh copy of IV.

    Follows the standard two-call BCrypt pattern: the first call with a
    NULL output buffer queries the required output size; the second call
    performs the actual transform.

    Args:
        hKey: BCrypt key handle from setup_bcrypt().
        data: Input bytes (ciphertext or plaintext).
        encrypt: True to encrypt, False (default) to decrypt.

    Returns:
        The transformed bytes.

    Raises:
        RuntimeError: if the BCrypt call returns a non-zero NTSTATUS.
        (Was an `assert`, which is stripped under `python -O`.)
    """
    # Fresh mutable IV copy per operation — BCrypt updates the IV buffer
    # in place, so reusing one buffer would chain operations together.
    iv = bytearray(IV)
    func = bcrypt.BCryptEncrypt if encrypt else bcrypt.BCryptDecrypt
    result_size = c_ulong(0)
    # Size query: NULL output buffer fills result_size with the needed length.
    func(hKey, data, len(data), None, None, 0, None, 0, byref(result_size), 0)
    output = (ctypes.c_ubyte * result_size.value)()
    actual = c_ulong(0)
    status = func(hKey, data, len(data), None,
                  (ctypes.c_ubyte * len(iv))(*iv), len(iv),
                  output, result_size.value, byref(actual), 0)
    if status != 0:
        raise RuntimeError(f"BCrypt op failed: {status:#x}")
    return bytes(output[:actual.value])
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def read_varint(data, pos):
    """Decode a protobuf base-128 varint starting at *pos*.

    Returns a (value, new_pos) tuple where new_pos is the index just past
    the last consumed byte. If *pos* is at or beyond the end of *data*,
    returns (0, pos) unchanged; a truncated varint yields the partial value.
    """
    value = 0
    shift = 0
    size = len(data)
    while pos < size:
        byte = data[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        if byte & 0x80 == 0:
            break
        shift += 7
    return value, pos
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def measure_onnx(data, valid_fields=frozenset({1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 20})):
    """Return the length of the longest valid protobuf prefix of *data*.

    Walks top-level protobuf fields and stops at the first field whose
    number is not in *valid_fields*, whose wire type is unsupported, or
    whose payload runs past the end of the buffer.

    Args:
        data: Candidate decrypted chunk payload (bytes-like).
        valid_fields: Accepted top-level field numbers. Defaults to the
            ONNX ModelProto field set (same values as the module constant
            ONNX_VALID_FIELDS); parameterized for reuse on other messages.

    Returns:
        Number of bytes from the start of *data* that parse cleanly.
    """
    def _varint(buf, at):
        # Base-128 varint decode; returns (value, new_position).
        value, shift = 0, 0
        while at < len(buf):
            byte = buf[at]
            at += 1
            value |= (byte & 0x7F) << shift
            if not (byte & 0x80):
                break
            shift += 7
        return value, at

    pos = 0
    last = 0
    while pos < len(data):
        start = pos
        tag, pos = _varint(data, pos)
        if pos > len(data):
            break
        field_no = tag >> 3
        # Renamed from `wt`, which shadowed the module's ctypes.wintypes alias.
        wire = tag & 7
        if field_no not in valid_fields:
            return start
        if wire == 0:    # varint payload
            _, pos = _varint(data, pos)
        elif wire == 1:  # fixed64
            pos += 8
        elif wire == 2:  # length-delimited
            length, pos = _varint(data, pos)
            pos += length
        elif wire == 5:  # fixed32
            pos += 4
        else:            # groups / unknown wire types: stop here
            return start
        if pos > len(data):
            return start
        last = pos
    return last
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main():
    """Decrypt a OneOCR .onemodel container and extract its payloads.

    Pipeline:
      1. Decrypt the DX index at its fixed location and check its magic.
      2. Locate chunk boundaries in the payload by searching for the
         ciphertext of the known container magic (the cipher uses a fixed
         key/IV per chunk, so the magic always encrypts to the same bytes).
      3. Decrypt every chunk and write recognized ONNX models / opaque
         data blobs into the output directory.

    Side effects: wipes and repopulates the output directory, prints a
    progress report to stdout. Windows-only (relies on bcrypt.dll).

    Fixes vs. the earlier revision: removed a leftover call to the
    undefined `_try_load` (NameError at runtime), removed the dead
    `dx_values` scan, replaced bare `except:` with `except Exception:`,
    and turned the DX-magic `assert` into a real error.
    """
    import argparse
    parser = argparse.ArgumentParser(description="OneOCR .onemodel decryptor (Windows BCrypt)")
    parser.add_argument("model_path", nargs="?", default="ocr_data/oneocr.onemodel")
    parser.add_argument("-o", "--output", default="onnx_models_static")
    args = parser.parse_args()

    model_path = Path(args.model_path)
    output_dir = Path(args.output)
    output_dir.mkdir(exist_ok=True, parents=True)
    # Start from a clean directory so stale files from earlier runs never mix in.
    for old in output_dir.glob("*"):
        old.unlink()

    data = model_path.read_bytes()
    print(f"{'='*70}")
    print(f"OneOCR Static Decryptor (BCrypt CNG)")
    print(f"{'='*70}")
    print(f"File: {model_path} ({len(data):,} bytes)")

    hAlg, hKey = setup_bcrypt()
    print(f"AES-256-CFB initialized")

    # Step 1: Decrypt DX index (fixed location: offset 24, size 22624)
    dx_offset = 24
    dx_size = 22624
    dx_dec = bcrypt_op(hKey, data[dx_offset:dx_offset + dx_size])
    print(f"\nDX index: starts with {dx_dec[:2].hex()}")
    # Real error rather than assert: asserts vanish under `python -O`.
    if dx_dec[:2] != b'DX':
        raise ValueError(f"DX header not found! Got: {dx_dec[:8].hex()}")
    (output_dir / "dx_index.bin").write_bytes(dx_dec)

    # Step 2: Payload starts after DX (offset 22648) + 36 bytes gap = 22684
    payload_start = dx_offset + dx_size + 36

    print(f"\n--- Scanning payload for encrypted chunks ---")
    print(f"Payload starts at offset {payload_start}")

    print(f"\nSearching for chunk boundaries by trial decryption...")

    # The cipher is deterministic (fixed key + IV), so the plaintext
    # container magic always encrypts to the same ciphertext prefix;
    # searching for that ciphertext locates chunk starts directly.
    magic_encrypted = bcrypt_op(hKey, CONTAINER_HEADER, encrypt=True)
    print(f"Container magic encrypted: {magic_encrypted.hex()}")

    chunk_starts = []
    search_start = payload_start
    while search_start < len(data) - 16:
        idx = data.find(magic_encrypted[:8], search_start)
        if idx < 0:
            break
        # Confirm with a 16-byte trial decryption to rule out byte collisions.
        test = bcrypt_op(hKey, data[idx:idx + 16])
        if test[:8] == CONTAINER_HEADER:
            chunk_starts.append(idx)
        search_start = idx + 1

    print(f"Found {len(chunk_starts)} potential chunk starts")

    if not chunk_starts:
        # Fallback: decrypt the entire payload in one pass and split the
        # plaintext on occurrences of the container magic.
        print("No chunk starts found via magic pattern. Trying sequential...")
        remaining = len(data) - payload_start
        dec = bcrypt_op(hKey, data[payload_start:payload_start + remaining])

        pos = 0
        chunks_data = []
        while True:
            idx = dec.find(CONTAINER_HEADER, pos)
            if idx < 0:
                if pos < len(dec):
                    chunks_data.append(dec[pos:])  # trailing unmarked data
                break
            if idx > pos:
                chunks_data.append(dec[pos:idx])  # gap before the magic
            pos = idx
            # One chunk runs from this magic to the next magic (or EOF).
            next_idx = dec.find(CONTAINER_HEADER, pos + 8)
            if next_idx < 0:
                chunks_data.append(dec[pos:])
                break
            chunks_data.append(dec[pos:next_idx])
            pos = next_idx

        print(f"Found {len(chunks_data)} chunks in sequential decryption")
    else:
        # Each chunk spans from its magic to the next chunk's magic (or EOF).
        chunk_starts.sort()
        chunks_data = []
        for i, start in enumerate(chunk_starts):
            end = chunk_starts[i + 1] if i + 1 < len(chunk_starts) else len(data)
            try:
                chunks_data.append(bcrypt_op(hKey, data[start:end]))
            except Exception:
                # Best-effort: a chunk that fails to decrypt is skipped.
                pass

    # Step 3: Classify each decrypted chunk as ONNX model or opaque data.
    print(f"\n--- Extracting ONNX models ---")
    models = []
    data_files = []

    for chunk in chunks_data:
        # Strip the 8-byte container magic if present.
        payload = chunk[8:] if chunk[:8] == CONTAINER_HEADER else chunk

        # An ONNX ModelProto starts with field 1 (tag 0x08) = ir_version 1..12.
        if len(payload) >= 2 and payload[0] == 0x08 and 1 <= payload[1] <= 12:
            valid_len = measure_onnx(payload)
            if valid_len < 100:  # Too small to be a real model
                continue
            onnx_data = payload[:valid_len]

            producer = "unknown"
            if b"PyTorch" in payload[:100]:
                producer = "pytorch"
            elif b"onnx.quantize" in payload[:100]:
                producer = "onnx_quantize"
            elif b"pytorch" in payload[:100]:
                producer = "pytorch_small"

            ir = payload[1]
            idx = len(models)
            fname = f"model_{idx:02d}_ir{ir}_{producer}_{valid_len//1024}KB.onnx"
            (output_dir / fname).write_bytes(onnx_data)
            models.append({'name': fname, 'size': valid_len})
            print(f" ONNX: {fname} ({valid_len:,} bytes)")
        elif len(payload) > 100:
            preview = payload[:30].decode('utf-8', errors='replace')
            idx = len(data_files)
            fname = f"data_{idx:02d}_{len(payload)}B.bin"
            (output_dir / fname).write_bytes(payload)
            data_files.append({'name': fname, 'size': len(payload)})
            print(f" Data: {fname} ({len(payload):,} bytes) {preview[:30]!r}")

    # Summary
    print(f"\n{'='*70}")
    print(f"EXTRACTION COMPLETE")
    print(f"{'='*70}")
    print(f"ONNX models: {len(models)}")
    print(f"Data files: {len(data_files)}")
    if models:
        total = sum(m['size'] for m in models)
        print(f"Total ONNX: {total:,} bytes ({total/1024/1024:.1f} MB)")

    # Optional verification when the onnx package is installed.
    try:
        import onnx
    except ImportError:
        pass
    else:
        ok = 0
        for m in models:
            try:
                onnx.load(str(output_dir / m['name']))
                ok += 1
            except Exception:
                pass
        print(f"Verified with onnx.load: {ok}/{len(models)}")

    bcrypt.BCryptDestroyKey(hKey)
    bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)


if __name__ == "__main__":
    main()
|
_archive/attempts/verify_bcrypt.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Verify BCrypt CNG setup - test raw key + different CFB segment sizes."""
import ctypes
from ctypes import c_void_p, c_ulong, byref
from pathlib import Path
import struct

# 32-byte AES-256 key and 16-byte IV recovered via hooking.
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
IV = b"Copyright @ OneO"

# Windows-only handle to bcrypt.dll; this line fails on other OSes.
bcrypt = ctypes.windll.bcrypt

# Known plaintext (DX header from hook dump)
dx_plain = bytes.fromhex("44580000000000005c58000000000000")
# Known ciphertext (from file at offset 24, first 16 bytes)
file_ct = bytes.fromhex("2e0c10c7c967f66b6d03821271115ad6")

# Full file data
file_data = Path("ocr_data/oneocr.onemodel").read_bytes()
# Reference dump of the decrypted DX index captured by the Frida hook.
# NOTE(review): loaded here but not referenced below in this script.
hook_dx = Path("frida_dump/decrypt_1_in22624_out22624.bin").read_bytes()

print("=" * 70)
print("BCrypt CNG CFB Segment Size Test")
print("=" * 70)
print(f"KEY: {KEY}")
print(f"IV: {IV}")
print(f"Expected PT: {dx_plain.hex()}")
print(f"Expected CT: {file_ct.hex()}")
print()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_cfb(msg_block_length, use_blob=False):
    """Test BCrypt AES-CFB with given MessageBlockLength.

    Runs a round-trip check against the known plaintext/ciphertext pair:
    encrypts dx_plain and compares to file_ct, then decrypts the first 32
    file bytes at offset 24 and checks for the 'DX' header. Prints a
    per-configuration OK/FAIL line for each direction.

    Args:
        msg_block_length: Value for the MessageBlockLength property
            (CFB segment size in bytes), or None to leave the default.
        use_blob: If True, pass a BCRYPT_KEY_DATA_BLOB (12-byte header +
            key) to BCryptGenerateSymmetricKey instead of the raw key —
            intentionally wrong, kept as a negative control.

    Returns:
        True/False for the ciphertext match, or None if setup failed.
    """
    tag = "MBL={}".format("default" if msg_block_length is None else msg_block_length)
    if use_blob:
        tag += "+blob"

    hAlg = c_void_p()
    status = bcrypt.BCryptOpenAlgorithmProvider(
        byref(hAlg), "AES\0".encode("utf-16-le"), None, 0
    )
    if status != 0:
        print(" [{}] OpenAlgorithm failed: {:#010x}".format(tag, status))
        return None

    mode = "ChainingModeCFB\0".encode("utf-16-le")
    status = bcrypt.BCryptSetProperty(
        hAlg, "ChainingMode\0".encode("utf-16-le"), mode, len(mode), 0
    )
    if status != 0:
        print(" [{}] SetProperty ChainingMode failed: {:#010x}".format(tag, status))
        bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
        return None

    # Optionally override the CFB segment size (MessageBlockLength, DWORD).
    if msg_block_length is not None:
        mbl = c_ulong(msg_block_length)
        status = bcrypt.BCryptSetProperty(
            hAlg, "MessageBlockLength\0".encode("utf-16-le"),
            byref(mbl), 4, 0
        )
        if status != 0:
            print(" [{}] SetProperty MBL={} failed: {:#010x}".format(tag, msg_block_length, status))
            bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
            return None

    hKey = c_void_p()
    if use_blob:
        # KDBM header (magic, version, key length) + raw key bytes.
        blob = struct.pack('<III', 0x4d42444b, 1, len(KEY)) + KEY
        status = bcrypt.BCryptGenerateSymmetricKey(
            hAlg, byref(hKey), None, 0, blob, len(blob), 0
        )
    else:
        status = bcrypt.BCryptGenerateSymmetricKey(
            hAlg, byref(hKey), None, 0, KEY, len(KEY), 0
        )

    if status != 0:
        print(" [{}] GenerateSymmetricKey failed: {:#010x}".format(tag, status))
        bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
        return None

    # Encrypt test
    iv_enc = bytearray(IV)
    result_size = c_ulong(0)
    # First call with NULL output buffer queries the required output size.
    bcrypt.BCryptEncrypt(hKey, dx_plain, len(dx_plain), None, None, 0,
                         None, 0, byref(result_size), 0)
    output = (ctypes.c_ubyte * result_size.value)()
    actual = c_ulong(0)
    iv_buf = (ctypes.c_ubyte * 16)(*iv_enc)
    bcrypt.BCryptEncrypt(hKey, dx_plain, len(dx_plain), None,
                         iv_buf, 16, output, result_size.value, byref(actual), 0)
    our_ct = bytes(output[:actual.value])
    ct_match = our_ct[:16] == file_ct

    # Decrypt test (fresh key, because BCrypt keys carry chaining state)
    hKey2 = c_void_p()
    if use_blob:
        blob = struct.pack('<III', 0x4d42444b, 1, len(KEY)) + KEY
        bcrypt.BCryptGenerateSymmetricKey(hAlg, byref(hKey2), None, 0, blob, len(blob), 0)
    else:
        bcrypt.BCryptGenerateSymmetricKey(hAlg, byref(hKey2), None, 0, KEY, len(KEY), 0)

    iv_dec = bytearray(IV)
    encrypted_chunk = file_data[24:24 + 32]
    result_size = c_ulong(0)
    bcrypt.BCryptDecrypt(hKey2, encrypted_chunk, len(encrypted_chunk), None, None, 0,
                         None, 0, byref(result_size), 0)
    output2 = (ctypes.c_ubyte * result_size.value)()
    iv_buf2 = (ctypes.c_ubyte * 16)(*iv_dec)
    # NOTE: reuses `actual` from the encrypt path; it is overwritten here.
    status = bcrypt.BCryptDecrypt(hKey2, encrypted_chunk, len(encrypted_chunk), None,
                                  iv_buf2, 16, output2, result_size.value, byref(actual), 0)
    our_pt = bytes(output2[:actual.value])
    pt_match = our_pt[:2] == b"DX"

    mark = "*** MATCH! ***" if ct_match else ""
    print(" [{}] Enc->CT: {} {} {}".format(tag, our_ct[:16].hex(), "OK" if ct_match else "FAIL", mark))
    print(" [{}] Dec->PT: {} {}".format(tag, our_pt[:16].hex(), "OK DX" if pt_match else "FAIL"))

    bcrypt.BCryptDestroyKey(hKey)
    bcrypt.BCryptDestroyKey(hKey2)
    bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
    return ct_match
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# Driver: exercise every key-loading strategy x CFB segment-size combination.
print("--- Raw key (correct for BCryptGenerateSymmetricKey) ---")
test_cfb(None)
test_cfb(1)
test_cfb(16)
print()

print("--- Blob key (has 12-byte header prepended - wrong) ---")
test_cfb(None, use_blob=True)
test_cfb(1, use_blob=True)
test_cfb(16, use_blob=True)
print()

# Third strategy: import the key through BCryptImportKey with a proper
# BCRYPT_KEY_DATA_BLOB instead of BCryptGenerateSymmetricKey.
print("--- BCryptImportKey with BCRYPT_KEY_DATA_BLOB ---")
for mbl in [None, 1, 16]:
    tag = "Import+MBL={}".format("default" if mbl is None else mbl)
    hAlg = c_void_p()
    bcrypt.BCryptOpenAlgorithmProvider(byref(hAlg), "AES\0".encode("utf-16-le"), None, 0)
    mode = "ChainingModeCFB\0".encode("utf-16-le")
    bcrypt.BCryptSetProperty(hAlg, "ChainingMode\0".encode("utf-16-le"), mode, len(mode), 0)

    # Optional CFB segment-size override (DWORD property).
    if mbl is not None:
        mbl_val = c_ulong(mbl)
        bcrypt.BCryptSetProperty(hAlg, "MessageBlockLength\0".encode("utf-16-le"),
                                 byref(mbl_val), 4, 0)

    # KDBM header (magic, version, key length) followed by the raw key.
    blob = struct.pack('<III', 0x4d42444b, 1, len(KEY)) + KEY
    hKey = c_void_p()
    status = bcrypt.BCryptImportKey(
        hAlg, None, "KeyDataBlob\0".encode("utf-16-le"),
        byref(hKey), None, 0, blob, len(blob), 0
    )
    if status != 0:
        print(" [{}] ImportKey failed: {:#010x}".format(tag, status))
        bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)
        continue

    # Decrypt the first 32 encrypted bytes (file offset 24) and look for 'DX'.
    iv_dec = bytearray(IV)
    encrypted_chunk = file_data[24:24 + 32]
    result_size = c_ulong(0)
    # Size query: NULL output buffer fills result_size with the needed length.
    bcrypt.BCryptDecrypt(hKey, encrypted_chunk, 32, None, None, 0,
                         None, 0, byref(result_size), 0)
    output = (ctypes.c_ubyte * result_size.value)()
    actual = c_ulong(0)
    iv_buf = (ctypes.c_ubyte * 16)(*iv_dec)
    status = bcrypt.BCryptDecrypt(hKey, encrypted_chunk, 32, None,
                                  iv_buf, 16, output, result_size.value, byref(actual), 0)
    dec = bytes(output[:actual.value])
    match = dec[:2] == b"DX"
    mark = "*** MATCH! ***" if match else ""
    print(" [{}] Decrypt: {} {} {}".format(tag, dec[:16].hex(), "OK DX" if match else "FAIL", mark))

    bcrypt.BCryptDestroyKey(hKey)
    bcrypt.BCryptCloseAlgorithmProvider(hAlg, 0)

print()
print("=" * 70)
print("If no method matched, need to hook BCryptSetProperty in the DLL")
print("to discover ALL properties set before BCryptDecrypt is called.")
|
_archive/attempts/verify_key_derivation.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Verify key derivation and analyze DX index structure.
Proven scheme:
    DX key = SHA256(master_key_32 + file[8:24])
    AES-256-CFB128, IV = "Copyright @ OneO"
"""
import hashlib
import struct
from pathlib import Path
# PyCryptodome (third-party) — needed for AES-CFB with a 128-bit segment size.
from Crypto.Cipher import AES

# 32-byte master key and 16-byte IV recovered via hooking.
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
IV = b"Copyright @ OneO"

file_data = Path("ocr_data/oneocr.onemodel").read_bytes()

# Step 1: Derive DX key — SHA-256 over master key + 16 header bytes at offset 8.
header_hash = file_data[8:24]
derived_key = hashlib.sha256(KEY + header_hash).digest()
print(f"DX derived key: {derived_key.hex()}")

# Step 2: Decrypt DX index (fixed location: offset 24, 22624 bytes).
encrypted_dx = file_data[24:24 + 22624]
# segment_size=128 selects CFB-128, matching BCrypt's default CFB mode.
cipher = AES.new(derived_key, AES.MODE_CFB, iv=IV, segment_size=128)
dx = cipher.decrypt(encrypted_dx)

assert dx[:2] == b"DX", "DX header mismatch!"
valid_size = struct.unpack('<Q', dx[8:16])[0]
print(f"DX valid size: {valid_size}, starts with DX: OK")

# Step 3: Hex dump
print(f"\nDX hex dump (first 512 bytes):")
for i in range(0, min(512, len(dx)), 16):
    chunk = dx[i:i+16]
    hex_str = ' '.join(f'{b:02x}' for b in chunk)
    ascii_str = ''.join(chr(b) if 32 <= b < 127 else '.' for b in chunk)
    print(f" {i:04x}: {hex_str:<48s} {ascii_str}")

# Step 4: Search for known hash inputs from hook data
# These hex patterns were captured from the hooked process; locating them in
# the decrypted DX tells us which DX records fed the per-chunk key derivation.
print(f"\n--- Searching for hash input patterns in DX ---")
patterns = {
    "Chunk1(config)": "7f2e000000000000972e0000000000003fe51f12a6d7432577c9b6b367b1ff4d",
    "Chunk2(encrypt)": "78000000000000009000000000000000",
    "Chunk3(bigONNX)": "7f4bb00000000000974bb00000000000165e6ebce48ad4c5b45554019f6cefe8",
    "Chunk4(ONNX)": "5c000000000000007400000000000000",
    "Chunk5(ONNX2)": "63000000000000007b00000000000000",
    "Chunk6(ONNX3)": "69bf34000000000081bf340000000000c7ed80dc84ea4fc4a891feae316ccc8e",
}

for name, hex_pat in patterns.items():
    target = bytes.fromhex(hex_pat)
    pos = dx.find(target)
    if pos >= 0:
        print(f" {name}: found at DX offset {pos} ({pos:#x})")
    else:
        print(f" {name}: NOT found in DX (len={len(target)})")

# Step 5: Analyze DX structure around container header magic
magic = bytes.fromhex("4a1a082b25000000")
print(f"\nContainer magic 4a1a082b25000000 locations:")
pos = 0
while True:
    pos = dx.find(magic, pos)
    if pos < 0:
        break
    # Read surrounding context (40 bytes starting at the magic)
    ctx = dx[pos:pos+40]
    print(f" offset {pos} ({pos:#x}): {ctx.hex()}")
    pos += 1

# Step 6: Parse DX as record-based structure
# Observed layout so far:
#   Offset 0-7:   "DX" magic padded with NULs
#   Offset 8-15:  valid_size (uint64 LE) = 22620
#   Offset 16-23: container magic = 4a1a082b25000000
#   Offset 24-31: uint64 = 0x2ea7 = 11943
# Let's see what's after that

print(f"\n--- DX parsed fields ---")
off = 0
print(f" [{off}] Magic: {dx[off:off+8]}")
off = 8
print(f" [{off}] ValidSize: {struct.unpack('<Q', dx[off:off+8])[0]}")
off = 16
print(f" [{off}] ContainerMagic: {dx[off:off+8].hex()}")
off = 24
print(f" [{off}] Value: {struct.unpack('<Q', dx[off:off+8])[0]}")
off = 32

# Look for uint64 pairs that were hash inputs
# The 16-byte patterns are two uint64 LE values
# The 32-byte patterns are two uint64 LE + 16-byte hash
# Let me scan for all pairs of uint64 in DX and see structure

# Save DX for manual analysis
Path("temp").mkdir(exist_ok=True)
Path("temp/dx_index_decrypted.bin").write_bytes(dx)
print(f"\nSaved DX to temp/dx_index_decrypted.bin ({len(dx)} bytes)")
|
_archive/attempts/verify_models.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Verify extracted .bin files as valid ONNX models."""
import os
import struct
from pathlib import Path

# Input directory with raw extracted blobs; verified models are copied out.
EXTRACT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\extracted_models")
VERIFIED_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\verified_models")
VERIFIED_DIR.mkdir(exist_ok=True)
|
| 9 |
+
|
| 10 |
+
def try_parse_onnx_protobuf(data: bytes) -> dict | None:
|
| 11 |
+
"""Try to parse the first few fields of an ONNX ModelProto protobuf."""
|
| 12 |
+
# ONNX ModelProto:
|
| 13 |
+
# field 1 (varint) = ir_version
|
| 14 |
+
# field 2 (len-delimited) = opset_import (repeated)
|
| 15 |
+
# field 3 (len-delimited) = producer_name
|
| 16 |
+
# field 4 (len-delimited) = producer_version
|
| 17 |
+
# field 5 (len-delimited) = domain
|
| 18 |
+
# field 6 (varint) = model_version
|
| 19 |
+
# field 7 (len-delimited) = doc_string
|
| 20 |
+
# field 8 (len-delimited) = graph (GraphProto)
|
| 21 |
+
|
| 22 |
+
if len(data) < 4:
|
| 23 |
+
return None
|
| 24 |
+
|
| 25 |
+
pos = 0
|
| 26 |
+
result = {}
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
# Field 1: ir_version (varint, field tag = 0x08)
|
| 30 |
+
if data[pos] != 0x08:
|
| 31 |
+
return None
|
| 32 |
+
pos += 1
|
| 33 |
+
|
| 34 |
+
# Read varint
|
| 35 |
+
ir_version = 0
|
| 36 |
+
shift = 0
|
| 37 |
+
while pos < len(data):
|
| 38 |
+
b = data[pos]
|
| 39 |
+
pos += 1
|
| 40 |
+
ir_version |= (b & 0x7F) << shift
|
| 41 |
+
if not (b & 0x80):
|
| 42 |
+
break
|
| 43 |
+
shift += 7
|
| 44 |
+
|
| 45 |
+
if ir_version < 1 or ir_version > 12:
|
| 46 |
+
return None
|
| 47 |
+
result['ir_version'] = ir_version
|
| 48 |
+
|
| 49 |
+
# Next field - check tag
|
| 50 |
+
if pos >= len(data):
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
tag = data[pos]
|
| 54 |
+
field_num = tag >> 3
|
| 55 |
+
wire_type = tag & 0x07
|
| 56 |
+
|
| 57 |
+
# We expect field 2 (opset_import, len-delimited, tag=0x12) or
|
| 58 |
+
# field 3 (producer_name, len-delimited, tag=0x1a)
|
| 59 |
+
if tag == 0x12: # field 2, length-delimited
|
| 60 |
+
pos += 1
|
| 61 |
+
# Read length varint
|
| 62 |
+
length = 0
|
| 63 |
+
shift = 0
|
| 64 |
+
while pos < len(data):
|
| 65 |
+
b = data[pos]
|
| 66 |
+
pos += 1
|
| 67 |
+
length |= (b & 0x7F) << shift
|
| 68 |
+
if not (b & 0x80):
|
| 69 |
+
break
|
| 70 |
+
shift += 7
|
| 71 |
+
|
| 72 |
+
if length > 0 and length < len(data):
|
| 73 |
+
result['has_opset_or_producer'] = True
|
| 74 |
+
result['next_field_len'] = length
|
| 75 |
+
else:
|
| 76 |
+
return None
|
| 77 |
+
elif tag == 0x1a: # field 3, length-delimited
|
| 78 |
+
pos += 1
|
| 79 |
+
length = 0
|
| 80 |
+
shift = 0
|
| 81 |
+
while pos < len(data):
|
| 82 |
+
b = data[pos]
|
| 83 |
+
pos += 1
|
| 84 |
+
length |= (b & 0x7F) << shift
|
| 85 |
+
if not (b & 0x80):
|
| 86 |
+
break
|
| 87 |
+
shift += 7
|
| 88 |
+
|
| 89 |
+
if length > 0 and length < 1000:
|
| 90 |
+
producer = data[pos:pos+length]
|
| 91 |
+
try:
|
| 92 |
+
result['producer_name'] = producer.decode('utf-8', errors='strict')
|
| 93 |
+
except:
|
| 94 |
+
result['producer_name'] = f"<binary {length}b>"
|
| 95 |
+
result['has_opset_or_producer'] = True
|
| 96 |
+
else:
|
| 97 |
+
return None
|
| 98 |
+
|
| 99 |
+
return result
|
| 100 |
+
|
| 101 |
+
except (IndexError, ValueError):
|
| 102 |
+
return None
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def check_onnx_with_lib(filepath: str) -> dict | None:
|
| 106 |
+
"""Try loading with onnx library."""
|
| 107 |
+
try:
|
| 108 |
+
import onnx
|
| 109 |
+
model = onnx.load(filepath)
|
| 110 |
+
return {
|
| 111 |
+
'ir_version': model.ir_version,
|
| 112 |
+
'producer': model.producer_name,
|
| 113 |
+
'model_version': model.model_version,
|
| 114 |
+
'opset': [f"{o.domain or 'ai.onnx'}:{o.version}" for o in model.opset_import],
|
| 115 |
+
'graph_name': model.graph.name if model.graph else None,
|
| 116 |
+
'num_nodes': len(model.graph.node) if model.graph else 0,
|
| 117 |
+
'num_inputs': len(model.graph.input) if model.graph else 0,
|
| 118 |
+
'num_outputs': len(model.graph.output) if model.graph else 0,
|
| 119 |
+
}
|
| 120 |
+
except Exception as e:
|
| 121 |
+
return None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# Phase 1: Quick protobuf header scan
|
| 125 |
+
print("=" * 70)
|
| 126 |
+
print("PHASE 1: Quick protobuf header scan")
|
| 127 |
+
print("=" * 70)
|
| 128 |
+
|
| 129 |
+
candidates = []
|
| 130 |
+
files = sorted(EXTRACT_DIR.glob("*.bin"), key=lambda f: f.stat().st_size, reverse=True)
|
| 131 |
+
print(f"Total files: {len(files)}")
|
| 132 |
+
|
| 133 |
+
for f in files:
|
| 134 |
+
size = f.stat().st_size
|
| 135 |
+
if size < 1000: # Skip tiny files
|
| 136 |
+
continue
|
| 137 |
+
|
| 138 |
+
with open(f, 'rb') as fh:
|
| 139 |
+
header = fh.read(256)
|
| 140 |
+
|
| 141 |
+
info = try_parse_onnx_protobuf(header)
|
| 142 |
+
if info and info.get('ir_version', 0) >= 3:
|
| 143 |
+
candidates.append((f, size, info))
|
| 144 |
+
|
| 145 |
+
print(f"Candidates with valid ONNX protobuf header: {len(candidates)}")
|
| 146 |
+
print()
|
| 147 |
+
|
| 148 |
+
# Group by ir_version
|
| 149 |
+
from collections import Counter
|
| 150 |
+
ir_counts = Counter(c[2]['ir_version'] for c in candidates)
|
| 151 |
+
print("IR version distribution:")
|
| 152 |
+
for v, cnt in sorted(ir_counts.items()):
|
| 153 |
+
total_size = sum(c[1] for c in candidates if c[2]['ir_version'] == v)
|
| 154 |
+
print(f" ir_version={v}: {cnt} files, total {total_size/1024/1024:.1f} MB")
|
| 155 |
+
|
| 156 |
+
# Phase 2: Try onnx.load on top candidates (by size, unique sizes to avoid duplicates)
|
| 157 |
+
print()
|
| 158 |
+
print("=" * 70)
|
| 159 |
+
print("PHASE 2: Verify with onnx library (top candidates by size)")
|
| 160 |
+
print("=" * 70)
|
| 161 |
+
|
| 162 |
+
# Take unique sizes - many files may be near-duplicates from overlapping memory
|
| 163 |
+
seen_sizes = set()
|
| 164 |
+
unique_candidates = []
|
| 165 |
+
for f, size, info in candidates:
|
| 166 |
+
# Round to nearest 1KB to detect near-duplicates
|
| 167 |
+
size_key = size // 1024
|
| 168 |
+
if size_key not in seen_sizes:
|
| 169 |
+
seen_sizes.add(size_key)
|
| 170 |
+
unique_candidates.append((f, size, info))
|
| 171 |
+
|
| 172 |
+
print(f"Unique-size candidates: {len(unique_candidates)}")
|
| 173 |
+
print()
|
| 174 |
+
|
| 175 |
+
verified = []
|
| 176 |
+
for i, (f, size, info) in enumerate(unique_candidates[:50]): # Check top 50 by size
|
| 177 |
+
result = check_onnx_with_lib(str(f))
|
| 178 |
+
if result:
|
| 179 |
+
verified.append((f, size, result))
|
| 180 |
+
print(f" VALID ONNX: {f.name}")
|
| 181 |
+
print(f" Size: {size/1024:.0f} KB")
|
| 182 |
+
print(f" ir={result['ir_version']} producer='{result['producer']}' "
|
| 183 |
+
f"opset={result['opset']}")
|
| 184 |
+
print(f" graph='{result['graph_name']}' nodes={result['num_nodes']} "
|
| 185 |
+
f"inputs={result['num_inputs']} outputs={result['num_outputs']}")
|
| 186 |
+
|
| 187 |
+
# Copy to verified dir
|
| 188 |
+
import shutil
|
| 189 |
+
dest_name = f"model_{len(verified):02d}_ir{result['ir_version']}_{result['graph_name'] or 'unknown'}_{size//1024}KB.onnx"
|
| 190 |
+
# Clean filename
|
| 191 |
+
dest_name = dest_name.replace('/', '_').replace('\\', '_').replace(':', '_')
|
| 192 |
+
dest = VERIFIED_DIR / dest_name
|
| 193 |
+
shutil.copy2(f, dest)
|
| 194 |
+
print(f" -> Saved as {dest_name}")
|
| 195 |
+
print()
|
| 196 |
+
|
| 197 |
+
if not verified:
|
| 198 |
+
print(" No files passed onnx.load validation in top 50.")
|
| 199 |
+
print()
|
| 200 |
+
# Try even more
|
| 201 |
+
print(" Trying ALL candidates...")
|
| 202 |
+
for i, (f, size, info) in enumerate(unique_candidates):
|
| 203 |
+
if i < 50:
|
| 204 |
+
continue
|
| 205 |
+
result = check_onnx_with_lib(str(f))
|
| 206 |
+
if result:
|
| 207 |
+
verified.append((f, size, result))
|
| 208 |
+
print(f" VALID ONNX: {f.name}")
|
| 209 |
+
print(f" Size: {size/1024:.0f} KB, ir={result['ir_version']}, "
|
| 210 |
+
f"producer='{result['producer']}', nodes={result['num_nodes']}")
|
| 211 |
+
|
| 212 |
+
import shutil
|
| 213 |
+
dest_name = f"model_{len(verified):02d}_ir{result['ir_version']}_{result['graph_name'] or 'unknown'}_{size//1024}KB.onnx"
|
| 214 |
+
dest_name = dest_name.replace('/', '_').replace('\\', '_').replace(':', '_')
|
| 215 |
+
dest = VERIFIED_DIR / dest_name
|
| 216 |
+
shutil.copy2(f, dest)
|
| 217 |
+
|
| 218 |
+
print()
|
| 219 |
+
print("=" * 70)
|
| 220 |
+
print(f"SUMMARY: {len(verified)} verified ONNX models out of {len(candidates)} candidates")
|
| 221 |
+
print("=" * 70)
|
| 222 |
+
|
| 223 |
+
if verified:
|
| 224 |
+
total_size = sum(v[1] for v in verified)
|
| 225 |
+
print(f"Total size: {total_size/1024/1024:.1f} MB")
|
| 226 |
+
for f, size, result in verified:
|
| 227 |
+
print(f" {f.name}: {size/1024:.0f}KB, {result['num_nodes']} nodes, "
|
| 228 |
+
f"graph='{result['graph_name']}'")
|
_archive/brainstorm.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ⚡ Skill: Brainstorm
|
| 2 |
+
|
| 3 |
+
> **Kategoria:** analysis | **Trudność:** advanced
|
| 4 |
+
> **Tokens:** ~2500 | **Model:** any (zalecany: Claude / GPT-4+)
|
| 5 |
+
> **Wersja:** 1.0.0 | **Utworzono:** 2026-02-10
|
| 6 |
+
> **Komendy aktywacyjne:** `mały brainstorm` | `duży brainstorm`
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## Kiedy używać
|
| 11 |
+
|
| 12 |
+
Gdy potrzebujesz **dogłębnie przemyśleć** problem, pomysł, decyzję, strategię lub architekturę — zamiast od razu działać. Brainstorm to faza deliberatywna przed fazą wykonawczą.
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## Tryby
|
| 17 |
+
|
| 18 |
+
| Tryb | Komenda | Długość outputu | Zastosowanie |
|
| 19 |
+
|------|---------|----------------|--------------|
|
| 20 |
+
| 🟢 Mały | `mały brainstorm` | ~500 linii (~2-4 stron A4) | Szybkie przemyślenie tematu, decyzja, pros/cons |
|
| 21 |
+
| 🔴 Duży | `duży brainstorm` | ~1000-2000 linii (~6-15 stron A4) | Głębokie planowanie, architektura, strategia, multi-dimensional analysis |
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## Rola (System Prompt)
|
| 26 |
+
|
| 27 |
+
<role>
|
| 28 |
+
Jesteś **Strategic Brainstorm Architect** — ekspert od deliberatywnego myślenia, analizy wielowymiarowej i systematycznej ewaluacji pomysłów. Łączysz techniki **Chain-of-Thought** (krokowe rozumowanie), **Tree-of-Thought** (rozgałęziona eksploracja z backtrackingiem) oraz **kreatywną dywergencję** (generowanie nieoczywistych rozwiązań).
|
| 29 |
+
|
| 30 |
+
**Twoja misja:** Nie odpowiadaj od razu — **MYŚL GŁĘBOKO**, eksploruj przestrzeń rozwiązań, oceniaj, eliminuj, syntetyzuj. Brainstorm to Twoja arena, a rezultatem jest treść, której user nie wygeneruje sam.
|
| 31 |
+
|
| 32 |
+
**Kompetencje kluczowe:**
|
| 33 |
+
- Wielowymiarowa analiza problemów (techniczne, biznesowe, ludzkie, czasowe)
|
| 34 |
+
- Generowanie 5-15+ rozwiązań/podejść na każdy problem (dywergencja)
|
| 35 |
+
- Krytyczna ewaluacja z użyciem skal, matryc i metryk (konwergencja)
|
| 36 |
+
- Eksploracja repozytorium i kontekstu projektu nim zaczniesz myśleć
|
| 37 |
+
- Identyfikacja ukrytych ryzyk, zależności i efektów drugiego rzędu
|
| 38 |
+
- Synteza: wybór najlepszej opcji z jasnym uzasadnieniem "dlaczego"
|
| 39 |
+
|
| 40 |
+
**Zasady pracy:**
|
| 41 |
+
- 🔍 **Kontekst first** — ZANIM zaczniesz brainstorm: przeskanuj repozytorium, przeczytaj README, zrozum co user buduje, zbierz kontekst, czasem użuj narzędzia do ankiety i zapytaj usera
|
| 42 |
+
- 🌐 **Szukaj w sieci** — jeśli masz dostęp do wyszukiwania, UŻYWAJ GO aktywnie. Sprawdzaj trendy, best practices, istniejące rozwiązania, benchmarki
|
| 43 |
+
- 🧠 **Self-prompting** — zadawaj SOBIE pytania pomocnicze w trakcie myślenia: "Czego jeszcze nie rozważyłem?", "Jakie jest drugie dno?", "Co by powiedział ekspert od X?"
|
| 44 |
+
- 🎨 **Uwolnij kreatywność** — generuj też rozwiązania niestandardowe, śmiałe, eksperymentalne — nawet jeśli ryzykowne
|
| 45 |
+
- 📏 **Tablica prawdy** — wyznaczone przez usera ŚWIĘTE ZASADY (constraints) są ABSOLUTNE — nigdy ich nie łam
|
| 46 |
+
- ⭐ **Oceniaj wszystko** — każde rozwiązanie/pomysł dostaje ocenę gwiazdkową 1-10
|
| 47 |
+
- 🔄 **Iteruj** — wracaj do wcześniejszych pomysłów w świetle nowych odkryć (backtracking ToT)
|
| 48 |
+
</role>
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## Instrukcje
|
| 53 |
+
|
| 54 |
+
<instructions>
|
| 55 |
+
|
| 56 |
+
### 📋 Struktura Brainstormu (Output)
|
| 57 |
+
|
| 58 |
+
Brainstorm generuje **2 pliki .md**:
|
| 59 |
+
|
| 60 |
+
**Plik 1:** `BRAINSTORM_{TEMAT}.md` — pełny brainstorm (w `temp/brain_storm/`)
|
| 61 |
+
**Plik 2:** `BRAINSTORM_{TEMAT}_SUMMARY.md` — podsumowanie + lista zadań (w `temp/brain_storm/`)
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
### FAZA 0: Zbieranie Kontekstu (OBOWIĄZKOWE)
|
| 66 |
+
|
| 67 |
+
Zanim napiszesz choćby jeden nagłówek:
|
| 68 |
+
|
| 69 |
+
1. **Przeskanuj repozytorium** — przeczytaj README, strukturę folderów, kluczowe pliki
|
| 70 |
+
2. **Zrozum kontekst usera** — kim jest, co buduje, jaki ma cel (sprawdź knowledge/ jeśli istnieje)
|
| 71 |
+
3. **Przeczytaj pliki powiązane z tematem** — jeśli brainstorm dotyczy kodu → przeczytaj kod; jeśli strategii → przeczytaj plany
|
| 72 |
+
4. **Szukaj w sieci** (jeśli dostępne) — sprawdź trendy, istniejące rozwiązania, artykuły, benchmarki
|
| 73 |
+
5. **Zidentyfikuj ŚWIĘTE ZASADY usera** — ograniczenia, które NIE podlegają dyskusji (constraints/non-negotiables)
|
| 74 |
+
|
| 75 |
+
> 💡 **Self-prompt:** "Czy mam wystarczająco kontekstu? Czego mi brakuje? O co powinienem dopytać?"
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
### FAZA 1: Definicja Problemu i Tablicy Prawdy
|
| 80 |
+
|
| 81 |
+
```markdown
|
| 82 |
+
## 🎯 Definicja Problemu
|
| 83 |
+
[Jasne, precyzyjne sformułowanie: CO dokładnie brainstormujemy i DLACZEGO]
|
| 84 |
+
|
| 85 |
+
## 📐 Tablica Prawdy (Constraints)
|
| 86 |
+
| # | Święta Zasada (Non-Negotiable) | Źródło | Status |
|
| 87 |
+
|---|-------------------------------|--------|--------|
|
| 88 |
+
| 1 | [zasada usera] | user | 🔒 ABSOLUTNA |
|
| 89 |
+
| 2 | [zasada usera] | user | 🔒 ABSOLUTNA |
|
| 90 |
+
| 3 | [zasada kontekstu] | repo | 🔒 ABSOLUTNA |
|
| 91 |
+
|
| 92 |
+
> ⚠️ Każde rozwiązanie MUSI przejść test tablicy prawdy. Jeśli łamie choć jedną zasadę → ODRZUCONE.
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
### FAZA 2: Dywergencja — Generowanie Pomysłów (Tree-of-Thought)
|
| 98 |
+
|
| 99 |
+
Generuj **wiele** podejść/rozwiązań. Minimum:
|
| 100 |
+
- 🟢 Mały brainstorm: **5-8 pomysłów**
|
| 101 |
+
- 🔴 Duży brainstorm: **10-20+ pomysłów**
|
| 102 |
+
|
| 103 |
+
Dla każdego pomysłu:
|
| 104 |
+
|
| 105 |
+
```markdown
|
| 106 |
+
### 💡 Pomysł X: [Nazwa]
|
| 107 |
+
**Opis:** [2-5 zdań: na czym polega]
|
| 108 |
+
**Mechanizm:** [Jak to działa / jak to zrealizować]
|
| 109 |
+
**Mocne strony:** [Co jest genialne]
|
| 110 |
+
**Słabe strony:** [Co może nie zagrać]
|
| 111 |
+
**Ryzyko:** [Co może pójść nie tak]
|
| 112 |
+
**Ocena:** ⭐⭐⭐⭐⭐⭐⭐⭐☆☆ (8/10)
|
| 113 |
+
**Test tablicy prawdy:** ✅ Przeszedł / ❌ Narusza zasadę #X
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
> 💡 **Self-prompt w trakcie generowania:**
|
| 117 |
+
> - "Jakie rozwiązanie zaproponowałby ktoś z zupełnie innej branży?"
|
| 118 |
+
> - "Co jeśli odwrócę problem do góry nogami?"
|
| 119 |
+
> - "Jakie podejście jest najbardziej ryzykowne, ale też najbardziej obiecujące?"
|
| 120 |
+
> - "Czego bym NIE chciał tutaj zrobić — i dlaczego? Czy na pewno słusznie to wykluczam?"
|
| 121 |
+
|
| 122 |
+
**Kategorie pomysłów do rozważenia:**
|
| 123 |
+
- 🛡️ **Bezpieczne** — sprawdzone, niskie ryzyko, proven solutions
|
| 124 |
+
- 🚀 **Ambitne** — wymagające, ale z dużym potencjałem
|
| 125 |
+
- 🎲 **Eksperymentalne** — wildcard, innowacyjne, mogą nie zadziałać
|
| 126 |
+
- 🤝 **Hybrydowe** — kombinacja kilku podejść
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
### FAZA 3: Konwergencja — Ewaluacja i Ranking (Chain-of-Thought)
|
| 131 |
+
|
| 132 |
+
#### 3.1 Matryca Porównawcza
|
| 133 |
+
|
| 134 |
+
```markdown
|
| 135 |
+
## 📊 Matryca Porównawcza
|
| 136 |
+
|
| 137 |
+
| Kryterium | Waga | Pomysł 1 | Pomysł 2 | Pomysł 3 | ... |
|
| 138 |
+
|-----------|------|----------|----------|----------|-----|
|
| 139 |
+
| Wykonalność | 25% | ⭐⭐⭐⭐⭐⭐⭐⭐☆☆ | ⭐⭐⭐⭐⭐⭐☆☆☆☆ | ... | ... |
|
| 140 |
+
| ROI / Wartość | 25% | ⭐⭐⭐⭐⭐⭐⭐☆☆☆ | ⭐⭐⭐⭐⭐⭐⭐⭐⭐☆ | ... | ... |
|
| 141 |
+
| Ryzyko (niższe=lepsze) | 20% | ⭐⭐⭐⭐⭐⭐⭐⭐☆☆ | ⭐⭐⭐⭐☆☆☆☆☆☆ | ... | ... |
|
| 142 |
+
| Czas realizacji | 15% | ⭐⭐⭐⭐⭐⭐⭐☆☆☆ | ⭐⭐⭐⭐⭐⭐⭐⭐☆☆ | ... | ... |
|
| 143 |
+
| Innowacyjność | 15% | ⭐⭐⭐⭐⭐☆☆☆☆☆ | ⭐⭐⭐⭐⭐⭐⭐⭐⭐☆ | ... | ... |
|
| 144 |
+
| **SUMA WAŻONA** | 100% | **7.3** | **7.1** | ... | ... |
|
| 145 |
+
```
|
| 146 |
+
|
| 147 |
+
#### 3.2 Strategie Decyzyjne
|
| 148 |
+
|
| 149 |
+
Zastosuj **minimum 3** strategie ewaluacji do zestawu pomysłów:
|
| 150 |
+
|
| 151 |
+
| Strategia | Opis | Kiedy skuteczna |
|
| 152 |
+
|-----------|------|-----------------|
|
| 153 |
+
| **Eliminacja negatywna** | Odrzuć wszystko co łamie constraints → zobacz co zostaje | Gdy masz dużo opcji do filtrowania |
|
| 154 |
+
| **Pareto 80/20** | Który pomysł daje 80% rezultatu za 20% wysiłku? | Gdy czas/zasoby są ograniczone |
|
| 155 |
+
| **Premortum** | "Jest rok później, projekt się nie powiódł — DLACZEGO?" | Identyfikacja ukrytych ryzyk |
|
| 156 |
+
| **10/10/10** | Jak oceniam tę decyzję za 10 minut / 10 miesięcy / 10 lat? | Decyzje strategiczne z długim horyzontem |
|
| 157 |
+
| **Odwrócenie** | "Co by się stało gdybym wybrał NAJGORSZĄ opcję?" | Uświadamianie, że różnica między opcjami może być mała |
|
| 158 |
+
| **First Principles** | Rozbij problem na fundamentalne prawdy → buduj od zera | Gdy istniejące rozwiązania nie pasują |
|
| 159 |
+
| **Matryca Eisenhowera** | Pilne vs. Ważne → priorytety | Planowanie i roadmapa |
|
| 160 |
+
| **Red Team / Devil's Advocate** | Aktywnie atakuj swoją najlepszą opcję — co jest w niej złe? | Walidacja przed finalną decyzją |
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
### FAZA 4: Deep Dive — Analiza Top 3 (tylko duży brainstorm)
|
| 165 |
+
|
| 166 |
+
Dla **dużego brainstormu** — rozbudowana analiza 3 najlepszych pomysłów:
|
| 167 |
+
|
| 168 |
+
```markdown
|
| 169 |
+
## 🔬 Deep Dive: [Pomysł X]
|
| 170 |
+
|
| 171 |
+
### Plan implementacji
|
| 172 |
+
[Krok po kroku: co, jak, kiedy, kto]
|
| 173 |
+
|
| 174 |
+
### Zależności
|
| 175 |
+
[Co musi istnieć / być gotowe ZANIM to zrobimy]
|
| 176 |
+
|
| 177 |
+
### Potencjalne problemy i mitygacja
|
| 178 |
+
| Problem | Prawdopodobieństwo | Wpływ | Mitygacja |
|
| 179 |
+
|---------|-------------------|-------|-----------|
|
| 180 |
+
| [problem] | WYSOKIE/ŚREDNIE/NISKIE | KRYTYCZNY/ZNACZĄCY/MAŁY | [jak zapobiec] |
|
| 181 |
+
|
| 182 |
+
### Zasoby wymagane
|
| 183 |
+
[Czas, narzędzia, wiedza, ludzie]
|
| 184 |
+
|
| 185 |
+
### Metryki sukcesu
|
| 186 |
+
[Jak zmierzymy, że to działa?]
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
---
|
| 190 |
+
|
| 191 |
+
### FAZA 5: Rozpoznanie Terenu — Dobre vs. Złe (Podział Kontekstowy)
|
| 192 |
+
|
| 193 |
+
```markdown
|
| 194 |
+
## ✅❌ Podział Kontekstowy
|
| 195 |
+
|
| 196 |
+
### ✅ Potencjalnie DOBRE w tym kontekście
|
| 197 |
+
| # | Co | Dlaczego dobre | Warunek sukcesu |
|
| 198 |
+
|---|----|----------------|-----------------|
|
| 199 |
+
| 1 | [element] | [uzasadnienie] | [co musi zaistnieć] |
|
| 200 |
+
|
| 201 |
+
### ❌ Potencjalnie ZŁE w tym kontekście
|
| 202 |
+
| # | Co | Dlaczego złe | Kiedy mogłoby zadziałać |
|
| 203 |
+
|---|----|-------------|------------------------|
|
| 204 |
+
| 1 | [element] | [uzasadnienie] | [inny kontekst] |
|
| 205 |
+
|
| 206 |
+
### ⚠️ Zależy od kontekstu (może być dobre LUB złe)
|
| 207 |
+
| # | Co | Kiedy dobre | Kiedy złe |
|
| 208 |
+
|---|----|-------------|-----------|
|
| 209 |
+
| 1 | [element] | [warunek A] | [warunek B] |
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
---
|
| 213 |
+
|
| 214 |
+
### FAZA 6: Wybór Najlepszej Opcji (Final Verdict)
|
| 215 |
+
|
| 216 |
+
```markdown
|
| 217 |
+
## 🏆 REKOMENDACJA FINALNA
|
| 218 |
+
|
| 219 |
+
### Wybrany pomysł: [Nazwa]
|
| 220 |
+
**Ocena końcowa:** ⭐⭐⭐⭐⭐⭐⭐⭐⭐☆ (9/10)
|
| 221 |
+
|
| 222 |
+
### Dlaczego ten?
|
| 223 |
+
[3-5 zdań uzasadnienia — odwołuj się do matrycy, strategii i tablicy prawdy]
|
| 224 |
+
|
| 225 |
+
### Dlaczego NIE pozostałe?
|
| 226 |
+
[Krótko: co dyskwalifikuje top-2 i top-3]
|
| 227 |
+
|
| 228 |
+
### Plan B (fallback)
|
| 229 |
+
[Który pomysł jest backup'em i kiedy na niego przejść]
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
### FAZA 7: Podsumowanie + Generowanie Pliku Summary
|
| 235 |
+
|
| 236 |
+
Po napisaniu pełnego brainstormu — **STWÓRZ DRUGI PLIK**:
|
| 237 |
+
|
| 238 |
+
**`BRAINSTORM_{TEMAT}_SUMMARY.md`** zawiera:
|
| 239 |
+
|
| 240 |
+
```markdown
|
| 241 |
+
# 📋 SUMMARY: [Temat]
|
| 242 |
+
|
| 243 |
+
> **Źródło:** `BRAINSTORM_{TEMAT}.md`
|
| 244 |
+
> **Data:** [data] | **Tryb:** [mały/duży]
|
| 245 |
+
|
| 246 |
+
## TL;DR
|
| 247 |
+
[3-5 zdań: problem → rekomendacja → dlaczego]
|
| 248 |
+
|
| 249 |
+
## Rekomendacja
|
| 250 |
+
[Wybrany pomysł + uzasadnienie]
|
| 251 |
+
|
| 252 |
+
## Kluczowe Insights
|
| 253 |
+
1. [Insight 1]
|
| 254 |
+
2. [Insight 2]
|
| 255 |
+
3. [Insight 3]
|
| 256 |
+
|
| 257 |
+
## 📝 Lista Zadań (Actionable Steps)
|
| 258 |
+
|
| 259 |
+
### Priorytet: 🔴 KRYTYCZNY
|
| 260 |
+
- [ ] **Krok 1:** [Co dokładnie zrobić] → **Rezultat:** [co powinno powstać]
|
| 261 |
+
- [ ] **Krok 2:** [Co dokładnie zrobić] → **Rezultat:** [co powinno powstać]
|
| 262 |
+
|
| 263 |
+
### Priorytet: 🟡 WYSOKI
|
| 264 |
+
- [ ] **Krok 3:** [Co dokładnie zrobić] → **Rezultat:** [co powinno powstać]
|
| 265 |
+
- [ ] **Krok 4:** [Co dokładnie zrobić] → **Rezultat:** [co powinno powstać]
|
| 266 |
+
|
| 267 |
+
### Priorytet: 🟢 NORMALNY
|
| 268 |
+
- [ ] **Krok 5:** [Co dokładnie zrobić] → **Rezultat:** [co powinno powstać]
|
| 269 |
+
|
| 270 |
+
## Ryzyka do monitorowania
|
| 271 |
+
| Ryzyko | Trigger | Akcja |
|
| 272 |
+
|--------|---------|-------|
|
| 273 |
+
| [risk] | [kiedy reagować] | [co zrobić] |
|
| 274 |
+
|
| 275 |
+
## Otwarte pytania
|
| 276 |
+
- ❓ [Pytanie wymagające decyzji usera]
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
</instructions>
|
| 280 |
+
|
| 281 |
+
---
|
| 282 |
+
|
| 283 |
+
## Ograniczenia
|
| 284 |
+
|
| 285 |
+
<constraints>
|
| 286 |
+
|
| 287 |
+
**Absolutne zasady (łamanie = fail):**
|
| 288 |
+
- ❌ **NIE pomijaj Fazy 0** (zbieranie kontekstu) — bez kontekstu brainstorm jest bezwartościowy
|
| 289 |
+
- ❌ **NIE łam Tablicy Prawdy** — constraints usera są ŚWIĘTE
|
| 290 |
+
- ❌ **NIE oceniaj bez uzasadnienia** — każda ocena gwiazdkowa musi mieć "dlaczego"
|
| 291 |
+
- ❌ **NIE kończ bez Summary** — ZAWSZE generuj 2 pliki (brainstorm + summary)
|
| 292 |
+
- ❌ **NIE generuj banalnych/oczywistych pomysłów** — twoja wartość to głębia, nie ilość
|
| 293 |
+
|
| 294 |
+
**Best practices (zawsze stosowane):**
|
| 295 |
+
- ✅ **Aktywnie szukaj w sieci** — jeśli masz narzędzia do wyszukiwania, UŻYWAJ ICH
|
| 296 |
+
- ✅ **Self-prompting** — regularnie zadawaj sobie pytania naprowadzające
|
| 297 |
+
- ✅ **Gwiazdki z uzasadnieniem** — ⭐ skala 1-10, ale ZAWSZE z komentarzem
|
| 298 |
+
- ✅ **Minimum 3 strategie decyzyjne** na fazę konwergencji
|
| 299 |
+
- ✅ **Emoji-driven structure** — użyj emoji jako wizualnych markerów sekcji
|
| 300 |
+
- ✅ **Backtracking** — wracaj do wcześniejszych pomysłów, jeśli nowe informacje zmieniają ocenę
|
| 301 |
+
- ✅ **Adaptuj kryteria** — dopasuj kryteria matrycy do konkretnego problemu (nie zawsze te same 5)
|
| 302 |
+
|
| 303 |
+
</constraints>
|
| 304 |
+
|
| 305 |
+
---
|
| 306 |
+
|
| 307 |
+
## Skala Gwiazdkowa (Referencja)
|
| 308 |
+
|
| 309 |
+
| Ocena | Gwiazdki | Znaczenie |
|
| 310 |
+
|-------|----------|-----------|
|
| 311 |
+
| 1/10 | ⭐☆☆☆☆☆☆☆☆☆ | Tragiczne — nie do użycia |
|
| 312 |
+
| 2/10 | ⭐⭐☆☆☆☆☆☆☆☆ | Bardzo słabe — poważne wady |
|
| 313 |
+
| 3/10 | ⭐⭐⭐☆☆☆☆☆☆☆ | Słabe — więcej wad niż zalet |
|
| 314 |
+
| 4/10 | ⭐⭐⭐⭐☆☆☆☆☆☆ | Poniżej średniej — ryzykowne |
|
| 315 |
+
| 5/10 | ⭐⭐⭐⭐⭐☆☆☆☆☆ | Średnie — OK ale nic specjalnego |
|
| 316 |
+
| 6/10 | ⭐⭐⭐⭐⭐⭐☆☆☆☆ | Przyzwoite — potencjał jest |
|
| 317 |
+
| 7/10 | ⭐⭐⭐⭐⭐⭐⭐☆☆☆ | Dobre — solidna opcja |
|
| 318 |
+
| 8/10 | ⭐⭐⭐⭐⭐⭐⭐⭐☆☆ | Bardzo dobre — mocna rekomendacja |
|
| 319 |
+
| 9/10 | ⭐⭐⭐⭐⭐⭐⭐⭐⭐☆ | Świetne — top tier |
|
| 320 |
+
| 10/10 | ⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐ | Perfekcyjne — rzadkość, uzasadnij wyjątkowo |
|
| 321 |
+
|
| 322 |
+
---
|
| 323 |
+
|
| 324 |
+
## Przykład użycia
|
| 325 |
+
|
| 326 |
+
**Input użytkownika:**
|
| 327 |
+
```
|
| 328 |
+
duży brainstorm: Jak zaprojektować system agentów AI do mojego repozytorium ProPrompts?
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
**Odpowiedź agenta:**
|
| 332 |
+
```
|
| 333 |
+
[Faza 0: Skanuje repozytorium, czyta README, MattyMroz.md, ZIP.md, istniejące agenty]
|
| 334 |
+
[Faza 1: Definiuje problem, tworzy tablicę prawdy z constraints usera]
|
| 335 |
+
[Faza 2: Generuje 12+ pomysłów z ocenami gwiazdkowymi]
|
| 336 |
+
[Faza 3: Matryca porównawcza + 4 strategie decyzyjne]
|
| 337 |
+
[Faza 4: Deep dive top 3 pomysłów]
|
| 338 |
+
[Faza 5: Podział kontekstowy dobre/złe]
|
| 339 |
+
[Faza 6: Finalna rekomendacja z uzasadnieniem]
|
| 340 |
+
[Faza 7: Tworzy BRAINSTORM_SYSTEM_AGENTOW_SUMMARY.md z listą zadań]
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## Warianty
|
| 346 |
+
|
| 347 |
+
- **Wariant A: Brainstorm techniczny** — focus na architekturze, kodzie, toolingu. Dodaj kryteria: performance, maintainability, scalability.
|
| 348 |
+
- **Wariant B: Brainstorm strategiczny** — focus na biznesie, rynku, decyzjach. Dodaj kryteria: ROI, market fit, competitive advantage.
|
| 349 |
+
- **Wariant C: Brainstorm kreatywny** — focus na pomysłach, naming, branding. Poluzuj rygory, maksymalizuj dywergencję (20+ pomysłów), używaj technik jak SCAMPER, lateral thinking.
|
| 350 |
+
|
| 351 |
+
---
|
| 352 |
+
|
| 353 |
+
## Changelog
|
| 354 |
+
|
| 355 |
+
- **v1.0.0** [2026-02-10]: Pierwsza wersja skilla brainstorm — pełna struktura 7-fazowa z trybami mały/duży
|
_archive/crack_config.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 7 |
+
|
| 8 |
+
# Load model_11
|
| 9 |
+
model = onnx.load(str(list(models_dir.glob("model_11_*"))[0]))
|
| 10 |
+
|
| 11 |
+
# Get feature/config blob
|
| 12 |
+
config_blob = None
|
| 13 |
+
for init in model.graph.initializer:
|
| 14 |
+
if init.name == "feature/config":
|
| 15 |
+
config_blob = bytes(init.string_data[0])
|
| 16 |
+
break
|
| 17 |
+
|
| 18 |
+
print(f"Config blob size: {len(config_blob)} bytes")
|
| 19 |
+
print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")
|
| 20 |
+
|
| 21 |
+
# Full float32 interpretation
|
| 22 |
+
all_floats = np.frombuffer(config_blob, dtype=np.float32)
|
| 23 |
+
print(f"\nFull blob as float32:")
|
| 24 |
+
print(f" Count: {len(all_floats)}")
|
| 25 |
+
print(f" Finite: {np.isfinite(all_floats).sum()}")
|
| 26 |
+
print(f" In [-10,10]: {np.sum(np.abs(all_floats) < 10)}")
|
| 27 |
+
print(f" Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
|
| 28 |
+
print(f" Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
|
| 29 |
+
print(f" First 20: {all_floats[:20]}")
|
| 30 |
+
|
| 31 |
+
# 4492 bytes / 4 = 1123 floats
|
| 32 |
+
# Hypothesis: some header + 21×50 weight matrix + 50 bias
|
| 33 |
+
# 1123 - 1050 - 50 = 23 extra floats (92 bytes header)
|
| 34 |
+
|
| 35 |
+
# Try different header sizes
|
| 36 |
+
for header_floats in range(0, 40):
|
| 37 |
+
remaining = len(all_floats) - header_floats
|
| 38 |
+
# Check if remaining = in_dim * out_dim + out_dim for some dimensions
|
| 39 |
+
for in_dim in [20, 21, 22]:
|
| 40 |
+
for out_dim in [48, 49, 50, 51, 52]:
|
| 41 |
+
needed = in_dim * out_dim + out_dim
|
| 42 |
+
if remaining == needed:
|
| 43 |
+
print(f"\n *** MATCH: header={header_floats} ({header_floats*4}B) + "
|
| 44 |
+
f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats")
|
| 45 |
+
W = all_floats[header_floats:header_floats + in_dim*out_dim].reshape(in_dim, out_dim)
|
| 46 |
+
b = all_floats[header_floats + in_dim*out_dim:header_floats + needed]
|
| 47 |
+
print(f" W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
|
| 48 |
+
print(f" b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")
|
| 49 |
+
|
| 50 |
+
if header_floats > 0:
|
| 51 |
+
header = all_floats[:header_floats]
|
| 52 |
+
print(f" Header values: {header}")
|
| 53 |
+
|
| 54 |
+
# Also try: the blob might encode multiple layers
|
| 55 |
+
# Or maybe it's quantized (int8/uint8)?
|
| 56 |
+
print(f"\n--- Trying int8 interpretation ---")
|
| 57 |
+
int8_arr = np.frombuffer(config_blob, dtype=np.int8)
|
| 58 |
+
print(f" int8 range: [{int8_arr.min()}, {int8_arr.max()}]")
|
| 59 |
+
|
| 60 |
+
uint8_arr = np.frombuffer(config_blob, dtype=np.uint8)
|
| 61 |
+
print(f" uint8 range: [{uint8_arr.min()}, {uint8_arr.max()}]")
|
| 62 |
+
|
| 63 |
+
# Maybe float16?
|
| 64 |
+
if len(config_blob) % 2 == 0:
|
| 65 |
+
f16_arr = np.frombuffer(config_blob, dtype=np.float16)
|
| 66 |
+
finite_f16 = np.isfinite(f16_arr).sum()
|
| 67 |
+
print(f" float16 count: {len(f16_arr)}, finite: {finite_f16}")
|
| 68 |
+
if finite_f16 > len(f16_arr) * 0.9:
|
| 69 |
+
print(f" float16 could work! range=[{f16_arr[np.isfinite(f16_arr)].min():.4f}, {f16_arr[np.isfinite(f16_arr)].max():.4f}]")
|
| 70 |
+
|
| 71 |
+
# Check the Slice in model_11 to understand input dimensions
|
| 72 |
+
print(f"\n--- Checking Slice constants to understand feature extraction ---")
|
| 73 |
+
for node in model.graph.node:
|
| 74 |
+
if node.op_type == "Constant":
|
| 75 |
+
for attr in node.attribute:
|
| 76 |
+
if attr.type == 4: # TENSOR
|
| 77 |
+
t = attr.t
|
| 78 |
+
data = onnx.numpy_helper.to_array(t)
|
| 79 |
+
print(f" Constant '{node.output[0]}': {data}")
|
| 80 |
+
|
| 81 |
+
# Check Add and Div constants
|
| 82 |
+
for node in model.graph.node:
|
| 83 |
+
if node.op_type in ("Add", "Div"):
|
| 84 |
+
print(f"\n {node.op_type}: {list(node.input)} → {list(node.output)}")
|
_archive/crack_endian.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Test big-endian float32 interpretation of OneOCRFeatureExtract config blob."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 7 |
+
model = onnx.load(str(list(models_dir.glob("model_11_*"))[0]))
|
| 8 |
+
|
| 9 |
+
# Get config blob
|
| 10 |
+
for init in model.graph.initializer:
|
| 11 |
+
if init.name == "feature/config":
|
| 12 |
+
blob = bytes(init.string_data[0])
|
| 13 |
+
break
|
| 14 |
+
|
| 15 |
+
print(f"Blob: {len(blob)} bytes = {len(blob) // 4} float32s")
|
| 16 |
+
|
| 17 |
+
# Big-endian float32
|
| 18 |
+
be_arr = np.frombuffer(blob, dtype='>f4') # big-endian
|
| 19 |
+
le_arr = np.frombuffer(blob, dtype='<f4') # little-endian
|
| 20 |
+
|
| 21 |
+
print(f"\nBig-endian float32:")
|
| 22 |
+
print(f" Finite: {np.isfinite(be_arr).sum()} / {len(be_arr)}")
|
| 23 |
+
in_range = np.sum(np.abs(be_arr[np.isfinite(be_arr)]) < 10)
|
| 24 |
+
print(f" In [-10,10]: {in_range} ({100*in_range/len(be_arr):.1f}%)")
|
| 25 |
+
be_finite = be_arr[np.isfinite(be_arr)]
|
| 26 |
+
print(f" Mean: {be_finite.mean():.4f}, Std: {be_finite.std():.4f}")
|
| 27 |
+
print(f" Range: [{be_finite.min():.4f}, {be_finite.max():.4f}]")
|
| 28 |
+
print(f" First 20: {be_arr[:20]}")
|
| 29 |
+
|
| 30 |
+
print(f"\nLittle-endian float32:")
|
| 31 |
+
print(f" Finite: {np.isfinite(le_arr).sum()} / {len(le_arr)}")
|
| 32 |
+
in_range_le = np.sum(np.abs(le_arr[np.isfinite(le_arr)]) < 10)
|
| 33 |
+
print(f" In [-10,10]: {in_range_le} ({100*in_range_le/len(le_arr):.1f}%)")
|
| 34 |
+
|
| 35 |
+
# If big-endian works, try to extract 21×50 weight matrix + 50 bias
|
| 36 |
+
# 1123 total floats
|
| 37 |
+
# Check feasible dimensions
|
| 38 |
+
print(f"\n--- Dimension search for big-endian ---")
|
| 39 |
+
for header in range(0, 40):
|
| 40 |
+
remaining = len(be_arr) - header
|
| 41 |
+
for in_d in [20, 21, 22]:
|
| 42 |
+
for out_d in [48, 49, 50, 51, 52]:
|
| 43 |
+
if remaining == in_d * out_d + out_d:
|
| 44 |
+
W = be_arr[header:header + in_d*out_d].reshape(in_d, out_d)
|
| 45 |
+
b = be_arr[header + in_d*out_d:]
|
| 46 |
+
w_finite = np.isfinite(W).sum()
|
| 47 |
+
w_reasonable = np.sum(np.abs(W[np.isfinite(W)]) < 10)
|
| 48 |
+
if w_reasonable > in_d * out_d * 0.7:
|
| 49 |
+
print(f" *** header={header} + W[{in_d}×{out_d}] + b[{out_d}]")
|
| 50 |
+
print(f" W finite={w_finite}, reasonable={w_reasonable}")
|
| 51 |
+
print(f" W range: [{W[np.isfinite(W)].min():.4f}, {W[np.isfinite(W)].max():.4f}]")
|
| 52 |
+
print(f" b range: [{b[np.isfinite(b)].min():.4f}, {b[np.isfinite(b)].max():.4f}]")
|
| 53 |
+
|
| 54 |
+
# Also test: could be byteswapped structure with header
|
| 55 |
+
# Try offset by checking where the "nice" values start
|
| 56 |
+
print(f"\n--- Finding good float32 regions (big-endian) ---")
|
| 57 |
+
for start_byte in range(0, 100, 4):
|
| 58 |
+
chunk = np.frombuffer(blob[start_byte:start_byte+84], dtype='>f4')
|
| 59 |
+
all_reasonable = all(np.isfinite(chunk)) and all(np.abs(chunk) < 10)
|
| 60 |
+
if all_reasonable:
|
| 61 |
+
print(f" offset={start_byte}: ALL 21 values reasonable: {chunk}")
|
| 62 |
+
break
|
| 63 |
+
decent = np.sum((np.abs(chunk) < 10) & np.isfinite(chunk))
|
| 64 |
+
if decent >= 18:
|
| 65 |
+
print(f" offset={start_byte}: {decent}/21 reasonable: {chunk}")
|
_archive/debug_detector.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Debug detector output to understand word segmentation."""
|
| 2 |
+
import numpy as np
|
| 3 |
+
import onnxruntime as ort
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 8 |
+
img = Image.open("image.png").convert("RGB")
|
| 9 |
+
w, h = img.size
|
| 10 |
+
|
| 11 |
+
# Detector setup
|
| 12 |
+
sess = ort.InferenceSession(str(next(models_dir.glob("model_00_*"))),
|
| 13 |
+
providers=["CPUExecutionProvider"])
|
| 14 |
+
|
| 15 |
+
scale = 800 / max(h, w)
|
| 16 |
+
dh = (int(h * scale) + 31) // 32 * 32
|
| 17 |
+
dw = (int(w * scale) + 31) // 32 * 32
|
| 18 |
+
img_d = np.array(img.resize((dw, dh), Image.LANCZOS), dtype=np.float32)
|
| 19 |
+
img_d = img_d[:, :, ::-1] - np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
|
| 20 |
+
data = img_d.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
|
| 21 |
+
im_info = np.array([[dh, dw, scale]], dtype=np.float32)
|
| 22 |
+
|
| 23 |
+
outputs = sess.run(None, {"data": data, "im_info": im_info})
|
| 24 |
+
output_names = [o.name for o in sess.get_outputs()]
|
| 25 |
+
out_dict = dict(zip(output_names, outputs))
|
| 26 |
+
|
| 27 |
+
# Analyze FPN2 (highest resolution)
|
| 28 |
+
pixel_scores = out_dict["scores_hori_fpn2"][0, 0] # [56, 200]
|
| 29 |
+
link_scores = out_dict["link_scores_hori_fpn2"][0] # [8, 56, 200]
|
| 30 |
+
|
| 31 |
+
print(f"FPN2 shape: {pixel_scores.shape}")
|
| 32 |
+
print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}")
|
| 33 |
+
|
| 34 |
+
# Find text region
|
| 35 |
+
text_mask = pixel_scores > 0.6
|
| 36 |
+
print(f"Text pixels (>0.6): {text_mask.sum()}")
|
| 37 |
+
|
| 38 |
+
# Get the row/column range of text pixels
|
| 39 |
+
ys, xs = np.where(text_mask)
|
| 40 |
+
if len(ys) > 0:
|
| 41 |
+
print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]")
|
| 42 |
+
|
| 43 |
+
# Check link scores within text region - do they separate words?
|
| 44 |
+
# Link 2 is East neighbor (right), Link 6 is West neighbor (left)
|
| 45 |
+
# If link between words is low, they should separate
|
| 46 |
+
row_mid = (ys.min() + ys.max()) // 2
|
| 47 |
+
print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
|
| 48 |
+
link_east = link_scores[2, row_mid, :] # E neighbor
|
| 49 |
+
for x in range(xs.min(), xs.max()+1):
|
| 50 |
+
ps = pixel_scores[row_mid, x]
|
| 51 |
+
le = link_east[x]
|
| 52 |
+
marker = "TEXT" if ps > 0.6 else " "
|
| 53 |
+
link_marker = "LINK" if le > 0.5 else "gap "
|
| 54 |
+
if ps > 0.3:
|
| 55 |
+
print(f" col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]")
|
| 56 |
+
|
| 57 |
+
# Also check if there are distinct "gap" regions in pixel scores
|
| 58 |
+
print(f"\nPixel scores along row {row_mid}:")
|
| 59 |
+
for x in range(max(0, xs.min()-2), min(pixel_scores.shape[1], xs.max()+3)):
|
| 60 |
+
ps = pixel_scores[row_mid, x]
|
| 61 |
+
bar = "█" * int(ps * 40)
|
| 62 |
+
print(f" col={x:3d}: {ps:.3f} {bar}")
|
| 63 |
+
|
| 64 |
+
# Try different thresholds
|
| 65 |
+
for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
|
| 66 |
+
mask = pixel_scores > thresh
|
| 67 |
+
n = mask.sum()
|
| 68 |
+
# Connected components using simple scan
|
| 69 |
+
from scipy import ndimage
|
| 70 |
+
try:
|
| 71 |
+
labels, n_comps = ndimage.label(mask)
|
| 72 |
+
print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
|
| 73 |
+
for c in range(1, min(n_comps+1, 10)):
|
| 74 |
+
comp_mask = labels == c
|
| 75 |
+
area = comp_mask.sum()
|
| 76 |
+
ys_c, xs_c = np.where(comp_mask)
|
| 77 |
+
print(f" Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")
|
| 78 |
+
except ImportError:
|
| 79 |
+
# Fallback without scipy
|
| 80 |
+
print(f"Threshold {thresh}: {n} pixels")
|
_archive/decode_config.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Decode OneOCRFeatureExtract config blob."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
import struct
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
m = onnx.load('oneocr_extracted/onnx_models/model_11_ir6_1.9_26KB.onnx')
|
| 8 |
+
|
| 9 |
+
for init in m.graph.initializer:
|
| 10 |
+
if init.name == 'feature/config':
|
| 11 |
+
raw = init.string_data[0]
|
| 12 |
+
print(f'Total bytes: {len(raw)}')
|
| 13 |
+
print(f'First 100 bytes hex: {raw[:100].hex()}')
|
| 14 |
+
|
| 15 |
+
# Try different structure interpretations
|
| 16 |
+
for offset in [0, 4, 8, 12]:
|
| 17 |
+
vals = struct.unpack_from('<4f', raw, offset)
|
| 18 |
+
print(f'Offset {offset:3d} as 4xfloat32: {vals}')
|
| 19 |
+
|
| 20 |
+
# Parse rnn_info to find LogPrior values
|
| 21 |
+
rnn = Path('oneocr_extracted/config_data/chunk_36_rnn_info.rnn_info').read_text()
|
| 22 |
+
rnn_lines = rnn.strip().split('\n')
|
| 23 |
+
lp_count = int(rnn_lines[0].split()[-1])
|
| 24 |
+
print(f'\nLogPrior count from rnn_info: {lp_count}')
|
| 25 |
+
lp_val = float(rnn_lines[1])
|
| 26 |
+
print(f'LogPrior[0] = {lp_val}')
|
| 27 |
+
|
| 28 |
+
lp_f32 = struct.pack('<f', np.float32(lp_val))
|
| 29 |
+
lp_f64 = struct.pack('<d', lp_val)
|
| 30 |
+
pos_f32 = raw.find(lp_f32)
|
| 31 |
+
pos_f64 = raw.find(lp_f64)
|
| 32 |
+
print(f'LogPrior as float32 at pos: {pos_f32}')
|
| 33 |
+
print(f'LogPrior as float64 at pos: {pos_f64}')
|
| 34 |
+
|
| 35 |
+
# Just look at data structure sections
|
| 36 |
+
# Check for repeating patterns, zeros, etc.
|
| 37 |
+
arr_f32 = np.frombuffer(raw, dtype=np.float32)
|
| 38 |
+
|
| 39 |
+
# Find sections of "reasonable" float values
|
| 40 |
+
reasonable = (np.abs(arr_f32) < 20) & (arr_f32 != 0)
|
| 41 |
+
transitions = np.diff(reasonable.astype(int))
|
| 42 |
+
starts = np.where(transitions == 1)[0] + 1
|
| 43 |
+
ends = np.where(transitions == -1)[0] + 1
|
| 44 |
+
|
| 45 |
+
print(f'\nSections of reasonable float32 values:')
|
| 46 |
+
for s, e in zip(starts[:10], ends[:10]):
|
| 47 |
+
print(f' [{s}:{e}] ({e-s} values) first: {arr_f32[s:s+3]}')
|
| 48 |
+
|
| 49 |
+
# Check if first few bytes are a header
|
| 50 |
+
header_ints = struct.unpack_from('<8I', raw, 0)
|
| 51 |
+
print(f'\nFirst 8 uint32: {header_ints}')
|
| 52 |
+
|
| 53 |
+
header_shorts = struct.unpack_from('<16H', raw, 0)
|
| 54 |
+
print(f'First 16 uint16: {header_shorts}')
|
| 55 |
+
|
| 56 |
+
# Maybe it's a rnn_info-like structure embedded
|
| 57 |
+
# The rnn_info has sections: <LogPrior>, <TransMat>, <LmSmall>/<LmMedium>
|
| 58 |
+
# Let's check the rnn_info structure fully
|
| 59 |
+
print('\n=== rnn_info structure ===')
|
| 60 |
+
section = None
|
| 61 |
+
counts = {}
|
| 62 |
+
for line in rnn_lines:
|
| 63 |
+
if line.startswith('<') and line.endswith('>'):
|
| 64 |
+
section = line
|
| 65 |
+
elif line.startswith('<') and '>' in line:
|
| 66 |
+
parts = line.strip().split()
|
| 67 |
+
section = parts[0].rstrip('>')+'>'
|
| 68 |
+
count = int(parts[-1]) if len(parts) > 1 else 0
|
| 69 |
+
counts[section] = count
|
| 70 |
+
print(f'Section: {section} count={count}')
|
| 71 |
+
else:
|
| 72 |
+
if section and section not in counts:
|
| 73 |
+
counts[section] = 0
|
| 74 |
+
print(f'Sections found: {counts}')
|
_archive/dedup.py
ADDED
|
@@ -0,0 +1,687 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smart OCR deduplication — stabilization-first approach.
|
| 2 |
+
|
| 3 |
+
Core principle: **don't read text until it STOPS CHANGING**.
|
| 4 |
+
Then check against read history to avoid repeats.
|
| 5 |
+
|
| 6 |
+
Architecture:
|
| 7 |
+
|
| 8 |
+
Phase 1 — **Snapshot Stabilization**
|
| 9 |
+
Each tick compares the full OCR output (all regions merged) with the
|
| 10 |
+
previous tick. If text is growing (typewriter effect), we wait.
|
| 11 |
+
Only when the snapshot is identical for ``stabilize_ticks`` consecutive
|
| 12 |
+
ticks do we consider it "stable" and proceed.
|
| 13 |
+
|
| 14 |
+
Phase 2 — **Line History Dedup**
|
| 15 |
+
Once stable, each line is fuzzy-compared against a history of previously
|
| 16 |
+
emitted lines. Only genuinely new lines pass through. History entries
|
| 17 |
+
expire via TTL so the same text can be re-read after a cooldown.
|
| 18 |
+
|
| 19 |
+
Phase 3 — **Significance Check**
|
| 20 |
+
Rejects composed output that is too short, has too few real words,
|
| 21 |
+
or is mostly non-alphanumeric (OCR garbage / UI artifacts).
|
| 22 |
+
|
| 23 |
+
This naturally handles:
|
| 24 |
+
- **Typewriter effects**: text grows → wait → stabilize → read complete sentence
|
| 25 |
+
- **Static UI** (HP bars, names): stabilizes → read once → in history → skip
|
| 26 |
+
- **OCR noise**: fuzzy matching tolerates minor variations
|
| 27 |
+
- **Dialog changes**: snapshot changes → re-stabilize → emit new parts only
|
| 28 |
+
- **Repeated dialog**: TTL expiry allows re-reading after cooldown
|
| 29 |
+
|
| 30 |
+
Usage::
|
| 31 |
+
|
| 32 |
+
from src.services.ocr.dedup import SmartDedup
|
| 33 |
+
|
| 34 |
+
dedup = SmartDedup()
|
| 35 |
+
text = dedup.process(region_labels, ocr_results)
|
| 36 |
+
if text is not None:
|
| 37 |
+
translate_and_speak(text)
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
from __future__ import annotations
|
| 41 |
+
|
| 42 |
+
import time
|
| 43 |
+
from collections import deque
|
| 44 |
+
from dataclasses import dataclass
|
| 45 |
+
from difflib import SequenceMatcher
|
| 46 |
+
|
| 47 |
+
from src.services.ocr.models import OcrResult
|
| 48 |
+
from src.utils.logger import logger
|
| 49 |
+
|
| 50 |
+
# ── Constants (sensible defaults) ────────────────────────────────
|
| 51 |
+
|
| 52 |
+
DEFAULT_STABILIZE_TICKS: int = 3
|
| 53 |
+
DEFAULT_SNAPSHOT_SIMILARITY: float = 0.92
|
| 54 |
+
DEFAULT_LINE_SIMILARITY: float = 0.80
|
| 55 |
+
DEFAULT_LINE_TTL: float = 120.0
|
| 56 |
+
DEFAULT_HISTORY_TTL: float = 90.0
|
| 57 |
+
DEFAULT_HISTORY_SIZE: int = 30
|
| 58 |
+
DEFAULT_MIN_NEW_CHARS: int = 8
|
| 59 |
+
DEFAULT_MIN_NEW_WORDS: int = 2
|
| 60 |
+
DEFAULT_MIN_ALNUM_RATIO: float = 0.35
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ── Data classes ─────────────────────────────────────────────────
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
|
| 67 |
+
class HistoryEntry:
|
| 68 |
+
"""An entry in the global text history ring buffer."""
|
| 69 |
+
|
| 70 |
+
norm_text: str
|
| 71 |
+
original_text: str
|
| 72 |
+
first_seen: float
|
| 73 |
+
last_seen: float
|
| 74 |
+
hit_count: int = 1
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass
|
| 78 |
+
class DedupConfig:
|
| 79 |
+
"""All tunable knobs for the dedup system.
|
| 80 |
+
|
| 81 |
+
Attributes:
|
| 82 |
+
stabilize_ticks: Consecutive identical ticks before text is considered "stable".
|
| 83 |
+
snapshot_similarity: Fuzzy threshold for treating two snapshots as identical (0-1).
|
| 84 |
+
line_similarity: Fuzzy threshold for line-level history matching (0-1).
|
| 85 |
+
line_ttl: Seconds before a known line in history expires.
|
| 86 |
+
history_ttl: Seconds before a global history entry expires.
|
| 87 |
+
history_size: Max entries in the global history ring buffer.
|
| 88 |
+
history_similarity: Alias for line_similarity (backward compat with bridge.py).
|
| 89 |
+
min_new_chars: Minimum characters for a change to be significant.
|
| 90 |
+
min_new_words: Minimum word count for significance.
|
| 91 |
+
min_alnum_ratio: Minimum alphanumeric ratio for significance.
|
| 92 |
+
debounce_time: Legacy field — not used internally, kept for bridge compat.
|
| 93 |
+
"""
|
| 94 |
+
|
| 95 |
+
stabilize_ticks: int = DEFAULT_STABILIZE_TICKS
|
| 96 |
+
snapshot_similarity: float = DEFAULT_SNAPSHOT_SIMILARITY
|
| 97 |
+
line_similarity: float = DEFAULT_LINE_SIMILARITY
|
| 98 |
+
line_ttl: float = DEFAULT_LINE_TTL
|
| 99 |
+
history_ttl: float = DEFAULT_HISTORY_TTL
|
| 100 |
+
history_size: int = DEFAULT_HISTORY_SIZE
|
| 101 |
+
history_similarity: float = DEFAULT_LINE_SIMILARITY
|
| 102 |
+
min_new_chars: int = DEFAULT_MIN_NEW_CHARS
|
| 103 |
+
min_new_words: int = DEFAULT_MIN_NEW_WORDS
|
| 104 |
+
min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO
|
| 105 |
+
debounce_time: float = 0.0 # legacy — mapped to stabilize_ticks externally
|
| 106 |
+
instant_mode: bool = False # skip stabilization — emit text on first identical tick
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Helpers ──────────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _normalize(text: str) -> str:
|
| 113 |
+
"""Collapse whitespace, strip, lowercase — for comparison only."""
|
| 114 |
+
return " ".join(text.split()).strip().lower()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Line History ─────────────────────────────────────────────────
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class LineHistory:
|
| 121 |
+
"""Tracks previously emitted lines with TTL-based expiry.
|
| 122 |
+
|
| 123 |
+
Each emitted line is stored (normalized) with a timestamp.
|
| 124 |
+
Old entries expire after ``ttl`` seconds, allowing re-reading.
|
| 125 |
+
Fuzzy matching handles OCR noise on short lines.
|
| 126 |
+
"""
|
| 127 |
+
|
| 128 |
+
def __init__(
|
| 129 |
+
self,
|
| 130 |
+
ttl: float = DEFAULT_LINE_TTL,
|
| 131 |
+
similarity: float = DEFAULT_LINE_SIMILARITY,
|
| 132 |
+
) -> None:
|
| 133 |
+
self._entries: dict[str, float] = {} # norm_line → last_emitted_at
|
| 134 |
+
self._ttl = ttl
|
| 135 |
+
self._similarity = similarity
|
| 136 |
+
|
| 137 |
+
def is_known(self, line: str) -> bool:
|
| 138 |
+
"""Check if a line was emitted recently (within TTL).
|
| 139 |
+
|
| 140 |
+
Uses exact match first, then fuzzy for short lines.
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
line: Raw (non-normalized) line text.
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
True if line is in recent history (should be skipped).
|
| 147 |
+
"""
|
| 148 |
+
norm = _normalize(line)
|
| 149 |
+
if len(norm) < 2:
|
| 150 |
+
return True # too short → treat as known (skip garbage)
|
| 151 |
+
|
| 152 |
+
now = time.monotonic()
|
| 153 |
+
self._gc(now)
|
| 154 |
+
|
| 155 |
+
# Fast path: exact match
|
| 156 |
+
if norm in self._entries:
|
| 157 |
+
return True
|
| 158 |
+
|
| 159 |
+
# Slow path: fuzzy match (short lines where OCR noise matters)
|
| 160 |
+
if len(norm) < 60:
|
| 161 |
+
for key in self._entries:
|
| 162 |
+
if abs(len(norm) - len(key)) > max(5, len(key) * 0.25):
|
| 163 |
+
continue
|
| 164 |
+
ratio = SequenceMatcher(None, norm, key).ratio()
|
| 165 |
+
if ratio >= self._similarity:
|
| 166 |
+
return True
|
| 167 |
+
|
| 168 |
+
return False
|
| 169 |
+
|
| 170 |
+
def mark_emitted(self, line: str) -> None:
|
| 171 |
+
"""Record a line as emitted."""
|
| 172 |
+
norm = _normalize(line)
|
| 173 |
+
if norm:
|
| 174 |
+
self._entries[norm] = time.monotonic()
|
| 175 |
+
|
| 176 |
+
def reset(self) -> None:
|
| 177 |
+
"""Clear all history."""
|
| 178 |
+
self._entries.clear()
|
| 179 |
+
|
| 180 |
+
@property
|
| 181 |
+
def size(self) -> int:
|
| 182 |
+
return len(self._entries)
|
| 183 |
+
|
| 184 |
+
def _gc(self, now: float) -> None:
|
| 185 |
+
"""Remove entries older than TTL."""
|
| 186 |
+
expired = [k for k, ts in self._entries.items() if now - ts > self._ttl]
|
| 187 |
+
for k in expired:
|
| 188 |
+
del self._entries[k]
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ── Global Text History (ring buffer for full text blocks) ───────
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class GlobalTextHistory:
|
| 195 |
+
"""Ring buffer of recently emitted text blocks with TTL.
|
| 196 |
+
|
| 197 |
+
Prevents the same composed text from being re-emitted within
|
| 198 |
+
the TTL window. Uses fuzzy matching to handle OCR noise.
|
| 199 |
+
"""
|
| 200 |
+
|
| 201 |
+
def __init__(
|
| 202 |
+
self,
|
| 203 |
+
max_size: int = DEFAULT_HISTORY_SIZE,
|
| 204 |
+
ttl: float = DEFAULT_HISTORY_TTL,
|
| 205 |
+
similarity: float = DEFAULT_LINE_SIMILARITY,
|
| 206 |
+
) -> None:
|
| 207 |
+
self._entries: deque[HistoryEntry] = deque(maxlen=max_size)
|
| 208 |
+
self._ttl = ttl
|
| 209 |
+
self._similarity = similarity
|
| 210 |
+
|
| 211 |
+
def is_duplicate(self, text: str) -> tuple[bool, float]:
|
| 212 |
+
"""Check whether text duplicates something in recent history.
|
| 213 |
+
|
| 214 |
+
Args:
|
| 215 |
+
text: Composed text block.
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
``(is_dup, best_similarity)``
|
| 219 |
+
"""
|
| 220 |
+
now = time.monotonic()
|
| 221 |
+
norm = _normalize(text)
|
| 222 |
+
if not norm:
|
| 223 |
+
return (True, 1.0)
|
| 224 |
+
|
| 225 |
+
best_sim = 0.0
|
| 226 |
+
for entry in self._entries:
|
| 227 |
+
if now - entry.last_seen > self._ttl:
|
| 228 |
+
continue
|
| 229 |
+
|
| 230 |
+
if entry.norm_text == norm:
|
| 231 |
+
entry.last_seen = now
|
| 232 |
+
entry.hit_count += 1
|
| 233 |
+
return (True, 1.0)
|
| 234 |
+
|
| 235 |
+
ratio = SequenceMatcher(None, norm, entry.norm_text).ratio()
|
| 236 |
+
best_sim = max(best_sim, ratio)
|
| 237 |
+
if ratio >= self._similarity:
|
| 238 |
+
entry.last_seen = now
|
| 239 |
+
entry.hit_count += 1
|
| 240 |
+
return (True, ratio)
|
| 241 |
+
|
| 242 |
+
return (False, best_sim)
|
| 243 |
+
|
| 244 |
+
def add(self, text: str) -> None:
|
| 245 |
+
"""Record a new text block in history."""
|
| 246 |
+
norm = _normalize(text)
|
| 247 |
+
now = time.monotonic()
|
| 248 |
+
self._entries.append(
|
| 249 |
+
HistoryEntry(
|
| 250 |
+
norm_text=norm,
|
| 251 |
+
original_text=text,
|
| 252 |
+
first_seen=now,
|
| 253 |
+
last_seen=now,
|
| 254 |
+
)
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
def reset(self) -> None:
|
| 258 |
+
self._entries.clear()
|
| 259 |
+
|
| 260 |
+
@property
|
| 261 |
+
def size(self) -> int:
|
| 262 |
+
return len(self._entries)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# ── Significance Check ───────────────────────────────────────────
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
class ChangeDetector:
|
| 269 |
+
"""Decide whether new lines constitute a meaningful change.
|
| 270 |
+
|
| 271 |
+
Rejects very short text, too few words, or mostly non-alphanumeric content.
|
| 272 |
+
"""
|
| 273 |
+
|
| 274 |
+
def __init__(
|
| 275 |
+
self,
|
| 276 |
+
min_chars: int = DEFAULT_MIN_NEW_CHARS,
|
| 277 |
+
min_words: int = DEFAULT_MIN_NEW_WORDS,
|
| 278 |
+
min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO,
|
| 279 |
+
) -> None:
|
| 280 |
+
self._min_chars = min_chars
|
| 281 |
+
self._min_words = min_words
|
| 282 |
+
self._min_alnum_ratio = min_alnum_ratio
|
| 283 |
+
|
| 284 |
+
def is_significant(self, new_lines: list[str]) -> bool:
|
| 285 |
+
"""Return True if the new lines represent real content, not OCR garbage."""
|
| 286 |
+
text = " ".join(line.strip() for line in new_lines).strip()
|
| 287 |
+
|
| 288 |
+
if len(text) < self._min_chars:
|
| 289 |
+
return False
|
| 290 |
+
|
| 291 |
+
words = text.split()
|
| 292 |
+
if len(words) < self._min_words:
|
| 293 |
+
return False
|
| 294 |
+
|
| 295 |
+
alnum = sum(1 for c in text if c.isalnum())
|
| 296 |
+
ratio = alnum / len(text) if text else 0
|
| 297 |
+
if ratio < self._min_alnum_ratio:
|
| 298 |
+
return False
|
| 299 |
+
|
| 300 |
+
return True
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ── Main Facade: SmartDedup ──────────────────────────────────────
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
class SmartDedup:
|
| 307 |
+
"""Stabilization-first OCR deduplication.
|
| 308 |
+
|
| 309 |
+
Core algorithm:
|
| 310 |
+
|
| 311 |
+
1. Each tick: merge all OCR results into a single text snapshot
|
| 312 |
+
2. Compare snapshot with previous tick — growing? same? different?
|
| 313 |
+
3. When snapshot is identical for ``stabilize_ticks`` consecutive ticks → STABLE
|
| 314 |
+
4. Extract lines, filter against read history → emit only NEW lines
|
| 315 |
+
5. Significance check → reject OCR garbage
|
| 316 |
+
6. Add emitted lines to history, record in global ring buffer
|
| 317 |
+
|
| 318 |
+
This replaces the old per-line-tracker approach which caused:
|
| 319 |
+
- Sentence fragments (read partial text too early)
|
| 320 |
+
- Infinite silence (partial lines marked "known" too aggressively)
|
| 321 |
+
|
| 322 |
+
Example::
|
| 323 |
+
|
| 324 |
+
dedup = SmartDedup()
|
| 325 |
+
|
| 326 |
+
# On each pipeline tick:
|
| 327 |
+
text = dedup.process(region_labels, ocr_results)
|
| 328 |
+
if text is not None:
|
| 329 |
+
await translate_and_speak(text)
|
| 330 |
+
|
| 331 |
+
# On pipeline stop or config change:
|
| 332 |
+
dedup.reset()
|
| 333 |
+
"""
|
| 334 |
+
|
| 335 |
+
def __init__(self, config: DedupConfig | None = None) -> None:
|
| 336 |
+
self._cfg = config or DedupConfig()
|
| 337 |
+
|
| 338 |
+
# Stabilization state
|
| 339 |
+
self._last_snapshot: str | None = None
|
| 340 |
+
self._last_raw: str | None = None
|
| 341 |
+
self._stable_count: int = 0
|
| 342 |
+
self._processed_snapshot: str | None = None
|
| 343 |
+
|
| 344 |
+
# Why: track last emitted text to detect post-emit growth
|
| 345 |
+
# (e.g. we emitted 2 lines, then lines 3-4 appear → continuation, not new text)
|
| 346 |
+
self._last_emitted_norm: str | None = None
|
| 347 |
+
|
| 348 |
+
# History layers
|
| 349 |
+
self._line_history = LineHistory(
|
| 350 |
+
ttl=self._cfg.line_ttl,
|
| 351 |
+
similarity=self._cfg.line_similarity,
|
| 352 |
+
)
|
| 353 |
+
self._global_history = GlobalTextHistory(
|
| 354 |
+
max_size=self._cfg.history_size,
|
| 355 |
+
ttl=self._cfg.history_ttl,
|
| 356 |
+
similarity=self._cfg.history_similarity,
|
| 357 |
+
)
|
| 358 |
+
self._change_detector = ChangeDetector(
|
| 359 |
+
min_chars=self._cfg.min_new_chars,
|
| 360 |
+
min_words=self._cfg.min_new_words,
|
| 361 |
+
min_alnum_ratio=self._cfg.min_alnum_ratio,
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
+
# ── Public API ───────────────────────────────────────────────
|
| 365 |
+
|
| 366 |
+
    def process(
        self,
        region_labels: list[str],
        ocr_results: list[OcrResult],
        *,
        force: bool = False,
    ) -> str | None:
        """Run stabilization-based dedup on multi-region OCR results.

        Per-tick pipeline:
            1. Merge all region texts into one snapshot.
            2. Wait for the snapshot to stop changing (stabilization),
               treating prefix-like growth as typewriter animation.
            3. Once stable: extract lines not in the line history, reject
               blocks matching the global history, then apply the
               significance filter before emitting.

        Args:
            region_labels: Label/ID for each region (for diagnostics).
                NOTE(review): currently unused by the body — kept for
                API compatibility.
            ocr_results: OCR result per region (same order as labels).
            force: If True, skip all dedup and return all text immediately.

        Returns:
            Text to translate + speak, or None if suppressed by dedup.
        """
        # ── Merge all regions into one snapshot ──
        raw_parts: list[str] = []
        for result in ocr_results:
            if result.error or result.is_empty:
                continue
            text = result.text.strip()
            if text:
                raw_parts.append(text)

        if not raw_parts:
            return None

        full_raw = "\n".join(raw_parts)
        full_norm = _normalize(full_raw)

        # Why: 1-char snapshots are OCR noise, never real dialog.
        if not full_norm or len(full_norm) < 2:
            return None

        # ── Force read: bypass all dedup ──
        if force:
            # Still record the text so normal ticks won't repeat it.
            self._global_history.add(full_raw)
            self._mark_all_lines_known(full_raw)
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = full_norm
            self._stable_count = 0
            logger.info("Dedup: force read — emitting %d chars", len(full_raw))
            return full_raw

        # ── Phase 1: Stabilization check ──
        if self._last_snapshot is None:
            # First tick — record snapshot, wait for next
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._stable_count = 0
            self._processed_snapshot = None
            # Why: in instant mode, skip waiting — proceed on the very first tick
            # (falls through; the self-comparison below then counts as stable).
            if not self._cfg.instant_mode:
                return None

        # Compare current snapshot with previous
        snapshot_sim = self._snapshot_similarity(self._last_snapshot, full_norm)

        if snapshot_sim >= self._cfg.snapshot_similarity:
            # Same (or very similar due to OCR noise) → count toward stability
            self._stable_count += 1
        elif self._is_text_growing(self._last_snapshot, full_norm):
            # Text is expanding (typewriter effect) → reset, keep waiting
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: text growing, waiting for stabilization")
            return None
        elif (
            self._last_emitted_norm is not None
            and self._is_text_growing(self._last_emitted_norm, full_norm)
        ):
            # Why: post-emit growth — we emitted lines 1-2, now lines 1-4 are visible.
            # The new snapshot is a SUPERSET of what we emitted → continuation.
            # Reset stability and wait for the full text to settle.
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: post-emit growth detected, waiting for continuation")
            return None
        else:
            # Completely different content → new text, start fresh
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: snapshot changed, waiting for stabilization")
            return None

        # Update raw text (keep latest version even during stability counting)
        self._last_snapshot = full_norm
        self._last_raw = full_raw

        # Not stable yet?
        required_ticks = 1 if self._cfg.instant_mode else self._cfg.stabilize_ticks
        if self._stable_count < required_ticks:
            return None

        # ── Already processed this exact snapshot? ──
        if self._processed_snapshot is not None:
            sim = self._snapshot_similarity(full_norm, self._processed_snapshot)
            if sim >= self._cfg.snapshot_similarity:
                return None  # already evaluated, nothing new

        # ── Phase 2: Text is STABLE — extract new lines ──
        all_lines = self._extract_lines(full_raw, ocr_results)
        new_lines: list[str] = []

        for line in all_lines:
            if not self._line_history.is_known(line):
                new_lines.append(line)

        # Also check against global text history (full text block dedup)
        if new_lines:
            composed = "\n".join(new_lines)
            is_dup, sim = self._global_history.is_duplicate(composed)
            if is_dup:
                logger.debug("Dedup: global history match (sim=%.3f)", sim)
                new_lines = []

        if not new_lines:
            # All lines already known — mark snapshot as processed
            self._processed_snapshot = full_norm
            return None

        # ── Phase 3: Significance check ──
        if not self._change_detector.is_significant(new_lines):
            logger.debug(
                "Dedup: new lines not significant (%d lines, %d chars)",
                len(new_lines),
                sum(len(line) for line in new_lines),
            )
            self._processed_snapshot = full_norm
            return None

        # ── EMIT! ──
        composed = "\n".join(new_lines)
        self._mark_all_lines_known(composed)
        self._global_history.add(composed)
        self._processed_snapshot = full_norm
        # Why: track what we emitted so we can detect post-emit growth
        self._last_emitted_norm = full_norm
        # Why: reset stable_count to prevent immediate re-emit on next tick
        self._stable_count = 0

        logger.info(
            "Dedup: emitting %d new lines (%d chars, %d known lines in history)",
            len(new_lines),
            len(composed),
            self._line_history.size,
        )
        return composed
|
| 522 |
+
|
| 523 |
+
def force_flush(self) -> str | None:
|
| 524 |
+
"""Force-emit whatever raw text is pending (for force-read button)."""
|
| 525 |
+
if self._last_raw:
|
| 526 |
+
raw = self._last_raw
|
| 527 |
+
self._global_history.add(raw)
|
| 528 |
+
self._mark_all_lines_known(raw)
|
| 529 |
+
return raw
|
| 530 |
+
return None
|
| 531 |
+
|
| 532 |
+
def update_config(self, config: DedupConfig) -> None:
|
| 533 |
+
"""Apply new configuration. Rebuilds internal components."""
|
| 534 |
+
self._cfg = config
|
| 535 |
+
self._line_history = LineHistory(
|
| 536 |
+
ttl=config.line_ttl,
|
| 537 |
+
similarity=config.line_similarity,
|
| 538 |
+
)
|
| 539 |
+
self._global_history = GlobalTextHistory(
|
| 540 |
+
max_size=config.history_size,
|
| 541 |
+
ttl=config.history_ttl,
|
| 542 |
+
similarity=config.history_similarity,
|
| 543 |
+
)
|
| 544 |
+
self._change_detector = ChangeDetector(
|
| 545 |
+
min_chars=config.min_new_chars,
|
| 546 |
+
min_words=config.min_new_words,
|
| 547 |
+
min_alnum_ratio=config.min_alnum_ratio,
|
| 548 |
+
)
|
| 549 |
+
logger.info("SmartDedup: config updated")
|
| 550 |
+
|
| 551 |
+
def reset(self) -> None:
|
| 552 |
+
"""Clear all state (e.g. on scene change or pipeline restart)."""
|
| 553 |
+
self._last_snapshot = None
|
| 554 |
+
self._last_raw = None
|
| 555 |
+
self._stable_count = 0
|
| 556 |
+
self._processed_snapshot = None
|
| 557 |
+
self._last_emitted_norm = None
|
| 558 |
+
self._line_history.reset()
|
| 559 |
+
self._global_history.reset()
|
| 560 |
+
logger.info("SmartDedup: all state reset")
|
| 561 |
+
|
| 562 |
+
def reset_region(self, label: str) -> None:
|
| 563 |
+
"""No-op in snapshot-based approach — kept for backward compat."""
|
| 564 |
+
pass
|
| 565 |
+
|
| 566 |
+
@property
|
| 567 |
+
def stats(self) -> dict[str, int]:
|
| 568 |
+
"""Return diagnostic stats."""
|
| 569 |
+
return {
|
| 570 |
+
"tracked_regions": 0,
|
| 571 |
+
"total_known_lines": self._line_history.size,
|
| 572 |
+
"history_size": self._global_history.size,
|
| 573 |
+
"stable_count": self._stable_count,
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
# ── Internal ─────────────────────────────────────────────────
|
| 577 |
+
|
| 578 |
+
@staticmethod
|
| 579 |
+
def _snapshot_similarity(a: str, b: str) -> float:
|
| 580 |
+
"""Fast similarity between two normalized snapshots."""
|
| 581 |
+
if a == b:
|
| 582 |
+
return 1.0
|
| 583 |
+
if not a or not b:
|
| 584 |
+
return 0.0
|
| 585 |
+
return SequenceMatcher(None, a, b).ratio()
|
| 586 |
+
|
| 587 |
+
@staticmethod
|
| 588 |
+
def _is_text_growing(old_norm: str, new_norm: str) -> bool:
|
| 589 |
+
"""Check if new text is an expansion of old text (typewriter effect).
|
| 590 |
+
|
| 591 |
+
Returns True if new_norm is longer AND contains most of old_norm's
|
| 592 |
+
words at the beginning (prefix-like growth).
|
| 593 |
+
"""
|
| 594 |
+
if len(new_norm) <= len(old_norm):
|
| 595 |
+
return False
|
| 596 |
+
|
| 597 |
+
# Simple prefix check — covers most typewriter cases
|
| 598 |
+
if new_norm.startswith(old_norm):
|
| 599 |
+
return True
|
| 600 |
+
|
| 601 |
+
# Word-level check: old words appear at the start of new word sequence
|
| 602 |
+
old_words = old_norm.split()
|
| 603 |
+
new_words = new_norm.split()
|
| 604 |
+
|
| 605 |
+
if len(new_words) <= len(old_words):
|
| 606 |
+
return False
|
| 607 |
+
|
| 608 |
+
# Count matching words at the beginning
|
| 609 |
+
matching = 0
|
| 610 |
+
for old_w, new_w in zip(old_words, new_words):
|
| 611 |
+
if old_w == new_w:
|
| 612 |
+
matching += 1
|
| 613 |
+
elif SequenceMatcher(None, old_w, new_w).ratio() > 0.8:
|
| 614 |
+
# Why: OCR noise may corrupt already-visible words slightly
|
| 615 |
+
matching += 1
|
| 616 |
+
|
| 617 |
+
# Why: 60% threshold — allows some OCR noise in the matching portion
|
| 618 |
+
return matching >= len(old_words) * 0.6
|
| 619 |
+
|
| 620 |
+
def _extract_lines(
|
| 621 |
+
self, raw_text: str, ocr_results: list[OcrResult]
|
| 622 |
+
) -> list[str]:
|
| 623 |
+
"""Extract individual lines from OCR results.
|
| 624 |
+
|
| 625 |
+
Prefers structured ``OcrResult.lines`` when available.
|
| 626 |
+
Deduplicates across regions (overlapping capture areas).
|
| 627 |
+
|
| 628 |
+
Args:
|
| 629 |
+
raw_text: Fallback raw text (used if no structured lines).
|
| 630 |
+
ocr_results: OCR results with structured lines.
|
| 631 |
+
|
| 632 |
+
Returns:
|
| 633 |
+
List of unique raw line texts.
|
| 634 |
+
"""
|
| 635 |
+
lines: list[str] = []
|
| 636 |
+
seen_norms: set[str] = set()
|
| 637 |
+
|
| 638 |
+
for result in ocr_results:
|
| 639 |
+
if result.error or result.is_empty:
|
| 640 |
+
continue
|
| 641 |
+
for ocr_line in result.lines:
|
| 642 |
+
raw = ocr_line.text.strip()
|
| 643 |
+
if not raw:
|
| 644 |
+
continue
|
| 645 |
+
norm = _normalize(raw)
|
| 646 |
+
if len(norm) < 2:
|
| 647 |
+
continue
|
| 648 |
+
|
| 649 |
+
# Why: skip duplicate lines across regions (overlapping capture areas)
|
| 650 |
+
if norm in seen_norms:
|
| 651 |
+
continue
|
| 652 |
+
|
| 653 |
+
# Fuzzy cross-region dedup for short lines
|
| 654 |
+
# Why: high threshold (0.95) because overlapping regions produce
|
| 655 |
+
# near-identical text, not merely similar text
|
| 656 |
+
is_cross_dup = False
|
| 657 |
+
if len(norm) < 60:
|
| 658 |
+
for seen in seen_norms:
|
| 659 |
+
if abs(len(norm) - len(seen)) > 3:
|
| 660 |
+
continue
|
| 661 |
+
if SequenceMatcher(None, norm, seen).ratio() >= 0.95:
|
| 662 |
+
is_cross_dup = True
|
| 663 |
+
break
|
| 664 |
+
if is_cross_dup:
|
| 665 |
+
continue
|
| 666 |
+
|
| 667 |
+
seen_norms.add(norm)
|
| 668 |
+
lines.append(raw)
|
| 669 |
+
|
| 670 |
+
# Fallback: if no structured lines, split raw text
|
| 671 |
+
if not lines:
|
| 672 |
+
for line in raw_text.split("\n"):
|
| 673 |
+
stripped = line.strip()
|
| 674 |
+
if stripped and len(_normalize(stripped)) >= 2:
|
| 675 |
+
norm = _normalize(stripped)
|
| 676 |
+
if norm not in seen_norms:
|
| 677 |
+
seen_norms.add(norm)
|
| 678 |
+
lines.append(stripped)
|
| 679 |
+
|
| 680 |
+
return lines
|
| 681 |
+
|
| 682 |
+
def _mark_all_lines_known(self, text: str) -> None:
|
| 683 |
+
"""Add all lines in text to line history."""
|
| 684 |
+
for line in text.split("\n"):
|
| 685 |
+
stripped = line.strip()
|
| 686 |
+
if stripped and len(_normalize(stripped)) >= 2:
|
| 687 |
+
self._line_history.mark_emitted(stripped)
|
_archive/dedup_old.py
ADDED
|
@@ -0,0 +1,595 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smart OCR deduplication — multi-layer heuristic to avoid re-reading the same text.
|
| 2 |
+
|
| 3 |
+
Architecture (3 layers):
|
| 4 |
+
|
| 5 |
+
Layer 1 — **Per-Region Line Tracker**
|
| 6 |
+
Each capture region keeps a dict of known OCR lines (normalized text → metadata).
|
| 7 |
+
New OCR results are compared line-by-line; only genuinely new lines pass through.
|
| 8 |
+
Stale entries expire after ``line_ttl`` seconds.
|
| 9 |
+
|
| 10 |
+
Layer 2 — **Global Text History** (ring buffer)
|
| 11 |
+
After composing new lines into a text block, the block is fuzzy-matched against
|
| 12 |
+
a bounded history of recently emitted texts. TTL-based expiry allows the same
|
| 13 |
+
dialog to be read again after a configurable cooldown.
|
| 14 |
+
|
| 15 |
+
Layer 3 — **Semantic Change Detector**
|
| 16 |
+
Rejects composed text that is too short, has too few real words, or is mostly
|
| 17 |
+
non-alphanumeric (OCR garbage / UI artifacts).
|
| 18 |
+
|
| 19 |
+
Debounce (optional)
|
| 20 |
+
When text grows incrementally (typewriter effect), the emitter waits for
|
| 21 |
+
stabilization before yielding the final text.
|
| 22 |
+
|
| 23 |
+
Usage::
|
| 24 |
+
|
| 25 |
+
from src.services.ocr.dedup import SmartDedup
|
| 26 |
+
|
| 27 |
+
dedup = SmartDedup()
|
| 28 |
+
text = dedup.process(regions, ocr_results)
|
| 29 |
+
if text is not None:
|
| 30 |
+
translate_and_speak(text)
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from __future__ import annotations
|
| 34 |
+
|
| 35 |
+
import time
|
| 36 |
+
from collections import deque
|
| 37 |
+
from dataclasses import dataclass
|
| 38 |
+
from difflib import SequenceMatcher
|
| 39 |
+
|
| 40 |
+
from src.services.ocr.models import OcrResult
|
| 41 |
+
from src.utils.logger import logger
|
| 42 |
+
|
| 43 |
+
# ── Constants (sensible defaults) ────────────────────────────────
# Units: seconds for TTL/time values, counts otherwise.
# Each value is documented in detail on the matching DedupConfig field.

DEFAULT_LINE_TTL: float = 120.0  # seconds before a known line expires (Layer 1)
DEFAULT_LINE_SIMILARITY: float = 0.80  # fuzzy threshold for line-level dedup (0-1)
DEFAULT_HISTORY_SIZE: int = 30  # max entries in the global ring buffer (Layer 2)
DEFAULT_HISTORY_TTL: float = 90.0  # seconds before a global history entry expires
DEFAULT_HISTORY_SIMILARITY: float = 0.82  # fuzzy threshold for global dedup (0-1)
DEFAULT_MIN_NEW_CHARS: int = 8  # minimum chars for a significant change (Layer 3)
DEFAULT_MIN_NEW_WORDS: int = 2  # minimum word count for significance
DEFAULT_MIN_ALNUM_RATIO: float = 0.35  # minimum alphanumeric ratio for significance
DEFAULT_DEBOUNCE_TIME: float = 0.0  # 0 = disabled
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ── Data classes ─────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass
class KnownLine:
    """A line previously seen by a RegionLineTracker."""

    text: str  # normalized line text (comparison key)
    first_seen: float  # time.monotonic() timestamp of first sighting
    last_seen: float  # time.monotonic() timestamp of most recent sighting
    hit_count: int = 1  # how many times this line has been observed
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
class HistoryEntry:
    """An entry in the global text history ring buffer."""

    norm_text: str  # normalized text used for duplicate comparisons
    original_text: str  # text exactly as originally emitted
    first_seen: float  # time.monotonic() timestamp when first recorded
    last_seen: float  # time.monotonic() timestamp of the latest duplicate hit
    hit_count: int = 1  # number of times this text matched
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@dataclass
class DedupConfig:
    """All tunable knobs for the dedup system.

    Every field defaults to the matching module-level ``DEFAULT_*`` constant.

    Attributes:
        line_ttl: Seconds before a known line expires (Layer 1).
        line_similarity: Fuzzy threshold for line-level dedup (0-1).
        history_size: Max entries in global ring buffer (Layer 2).
        history_ttl: Seconds before a global history entry expires.
        history_similarity: Fuzzy threshold for global dedup (0-1).
        min_new_chars: Minimum characters for a change to be significant (Layer 3).
        min_new_words: Minimum word count for significance.
        min_alnum_ratio: Minimum alphanumeric ratio for significance.
        debounce_time: Seconds to wait for text stabilization (0 = off).
    """

    line_ttl: float = DEFAULT_LINE_TTL
    line_similarity: float = DEFAULT_LINE_SIMILARITY
    history_size: int = DEFAULT_HISTORY_SIZE
    history_ttl: float = DEFAULT_HISTORY_TTL
    history_similarity: float = DEFAULT_HISTORY_SIMILARITY
    min_new_chars: int = DEFAULT_MIN_NEW_CHARS
    min_new_words: int = DEFAULT_MIN_NEW_WORDS
    min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO
    debounce_time: float = DEFAULT_DEBOUNCE_TIME
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ── Helpers ──────────────────────────────────────────────────────
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _normalize(text: str) -> str:
|
| 111 |
+
"""Collapse whitespace, strip, lowercase — for comparison only."""
|
| 112 |
+
return " ".join(text.split()).strip().lower()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ── Layer 1: Per-Region Line Tracker ─────────────────────────────
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class RegionLineTracker:
    """Track known lines for a single capture region.

    Lines already seen (exact or fuzzy match) are filtered out.
    Entries expire after ``line_ttl`` seconds so the same text
    can be re-read after a cooldown.
    """

    def __init__(
        self,
        similarity: float = DEFAULT_LINE_SIMILARITY,
        line_ttl: float = DEFAULT_LINE_TTL,
    ) -> None:
        # Maps normalized line text → KnownLine metadata.
        self._known: dict[str, KnownLine] = {}
        self._similarity = similarity
        self._line_ttl = line_ttl

    def extract_new_lines(self, ocr_result: OcrResult) -> list[str]:
        """Return only lines that are NOT already known.

        Reappearing known lines have their ``last_seen`` refreshed, which
        keeps continuously visible text from ever expiring.

        Args:
            ocr_result: OCR result with ``.lines`` populated.

        Returns:
            List of *original* (non-normalized) line texts that are new.
        """
        now = time.monotonic()
        self._gc(now)  # drop expired entries before comparing

        new_lines: list[str] = []
        for line in ocr_result.lines:
            raw = line.text.strip()
            if not raw:
                continue
            norm = _normalize(raw)
            # Why: 1-char lines are almost always OCR noise.
            if len(norm) < 2:
                continue

            # Fast path: exact match
            if norm in self._known:
                self._known[norm].last_seen = now
                self._known[norm].hit_count += 1
                continue

            # Slow path: fuzzy match (only short texts where OCR noise matters)
            matched = False
            if len(norm) < 60:
                for key, entry in self._known.items():
                    # Skip candidates with very different length
                    if abs(len(norm) - len(key)) > max(5, len(key) * 0.2):
                        continue
                    ratio = SequenceMatcher(None, norm, key).ratio()
                    if ratio >= self._similarity:
                        entry.last_seen = now
                        entry.hit_count += 1
                        matched = True
                        break

            if not matched:
                self._known[norm] = KnownLine(
                    text=norm, first_seen=now, last_seen=now
                )
                new_lines.append(raw)

        return new_lines

    def reset(self) -> None:
        """Clear all known lines (e.g. on scene change)."""
        self._known.clear()

    @property
    def known_count(self) -> int:
        """Number of tracked lines."""
        return len(self._known)

    def _gc(self, now: float) -> None:
        """Remove lines not seen for longer than TTL."""
        # Collect first, then delete — avoids mutating the dict mid-iteration.
        expired = [
            k for k, v in self._known.items() if now - v.last_seen > self._line_ttl
        ]
        for k in expired:
            del self._known[k]
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ── Layer 2: Global Text History ─────────────────────────────────
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
class GlobalTextHistory:
    """Ring buffer of recently emitted text blocks with TTL.

    Prevents the same composed text from being processed twice within the
    TTL window, even when it arrives from different regions or after a
    brief interruption.
    """

    def __init__(
        self,
        max_size: int = DEFAULT_HISTORY_SIZE,
        ttl: float = DEFAULT_HISTORY_TTL,
        similarity: float = DEFAULT_HISTORY_SIMILARITY,
    ) -> None:
        self._ttl = ttl
        self._similarity = similarity
        # deque(maxlen=...) silently evicts the oldest entry when full.
        self._entries: deque[HistoryEntry] = deque(maxlen=max_size)

    def is_duplicate(self, text: str) -> tuple[bool, float]:
        """Check whether *text* duplicates something in recent history.

        Matching entries get their ``last_seen``/``hit_count`` refreshed,
        which extends their lifetime.

        Args:
            text: Composed text block (already new-line joined).

        Returns:
            ``(is_dup, best_similarity)`` — whether it matched and how closely.
        """
        now = time.monotonic()
        norm = _normalize(text)
        if not norm:
            # Empty text is treated as a perfect duplicate.
            return (True, 1.0)

        best = 0.0
        for entry in self._entries:
            if now - entry.last_seen > self._ttl:
                continue  # expired — ignore (eviction is handled by maxlen)

            if norm == entry.norm_text:
                # Exact normalized match — short-circuit at full confidence.
                entry.last_seen = now
                entry.hit_count += 1
                return (True, 1.0)

            score = SequenceMatcher(None, norm, entry.norm_text).ratio()
            if score > best:
                best = score
            if score >= self._similarity:
                entry.last_seen = now
                entry.hit_count += 1
                return (True, score)

        return (False, best)

    def add(self, text: str) -> None:
        """Record a new text block in history."""
        stamp = time.monotonic()
        self._entries.append(
            HistoryEntry(
                norm_text=_normalize(text),
                original_text=text,
                first_seen=stamp,
                last_seen=stamp,
            )
        )

    def reset(self) -> None:
        """Clear all history entries."""
        self._entries.clear()

    @property
    def size(self) -> int:
        """Number of buffered entries (including expired ones)."""
        return len(self._entries)
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
# ── Layer 3: Semantic Change Detector ────────────────────────────
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
class ChangeDetector:
    """Decide whether a set of new lines constitutes a meaningful change.

    Rejects:
        - Very short text (< ``min_chars`` printable characters)
        - Too few words (< ``min_words``)
        - Mostly non-alphanumeric (ratio < ``min_alnum_ratio``)
    """

    def __init__(
        self,
        min_chars: int = DEFAULT_MIN_NEW_CHARS,
        min_words: int = DEFAULT_MIN_NEW_WORDS,
        min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO,
    ) -> None:
        self._min_chars = min_chars
        self._min_words = min_words
        self._min_alnum_ratio = min_alnum_ratio

    def is_significant(self, new_lines: list[str]) -> bool:
        """Return ``True`` if the new lines represent a real content change."""
        joined = " ".join(part.strip() for part in new_lines).strip()

        # Gate 1: enough characters overall.
        if len(joined) < self._min_chars:
            return False

        # Gate 2: enough whitespace-separated words.
        if len(joined.split()) < self._min_words:
            return False

        # Gate 3: mostly real (alphanumeric) content, not OCR artifacts.
        alnum_count = sum(c.isalnum() for c in joined)
        density = alnum_count / len(joined) if joined else 0
        return density >= self._min_alnum_ratio
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# ── Debounce Emitter ─────────────────────────────────────────────
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
class DebouncedEmitter:
    """Buffer text and only yield it after stabilization.

    Useful for typewriter-effect dialogs where text appears incrementally.
    If ``stabilize_time`` is 0, debouncing is disabled (pass-through).
    """

    def __init__(self, stabilize_time: float = DEFAULT_DEBOUNCE_TIME) -> None:
        self._stabilize = stabilize_time
        self._pending: str | None = None  # candidate awaiting stabilization
        self._pending_since: float = 0.0  # monotonic time the candidate appeared

    def feed(self, text: str) -> str | None:
        """Feed new text; return it once it has been stable long enough.

        Args:
            text: The candidate text to emit.

        Returns:
            The stabilized text, or ``None`` if still waiting.
        """
        if self._stabilize <= 0:
            # Debounce disabled → pass straight through.
            return text

        now = time.monotonic()

        changed = (
            self._pending is None
            or _normalize(text) != _normalize(self._pending)
        )
        if changed:
            # New or altered candidate → restart the stabilization clock.
            self._pending = text
            self._pending_since = now
            return None

        if now - self._pending_since < self._stabilize:
            return None  # unchanged, but not stable long enough yet

        # Stable for the full window — emit and clear.
        stable = self._pending
        self._pending = None
        return stable

    def flush(self) -> str | None:
        """Force-emit whatever is pending (used on pipeline stop / force-read)."""
        pending, self._pending = self._pending, None
        return pending

    def reset(self) -> None:
        """Discard pending text."""
        self._pending = None
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
# ── Cross-Region Dedup Pool ──────────────────────────────────────
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
class CrossRegionPool:
    """Track lines across regions within one tick to prevent cross-region duplication.

    Within a single pipeline tick, if region A already yielded line X,
    region B should skip it.
    """

    def __init__(self, similarity: float = DEFAULT_LINE_SIMILARITY) -> None:
        # Maps normalized line → original line.
        self._seen: dict[str, str] = {}
        self._similarity = similarity

    def is_seen(self, line: str) -> bool:
        """Check if this line was already yielded by another region this tick."""
        norm = _normalize(line)
        if not norm:
            # Empty lines are never worth yielding at all.
            return True

        # Exact match.
        if norm in self._seen:
            return True

        # Fuzzy comparison only for short lines, where OCR noise matters.
        if len(norm) >= 60:
            return False
        for candidate in self._seen:
            # Length gate rules out obviously different candidates cheaply.
            if abs(len(norm) - len(candidate)) > max(4, len(candidate) * 0.2):
                continue
            if SequenceMatcher(None, norm, candidate).ratio() >= self._similarity:
                return True

        return False

    def mark(self, line: str) -> None:
        """Record a line as yielded this tick."""
        norm = _normalize(line)
        if norm:
            self._seen[norm] = line

    def clear(self) -> None:
        """Reset for next tick."""
        self._seen.clear()
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# ── Main Facade: SmartDedup ──────────────────────────────────────
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
class SmartDedup:
    """Three-layer OCR deduplication with debounce and cross-region awareness.

    Replaces the old single-``_last_ocr_text`` comparison in ``bridge.py``.

    Layers (in processing order):
      1. Per-region line tracking + cross-region dedup (``RegionLineTracker``
         and ``CrossRegionPool``).
      3. Semantic significance check (``ChangeDetector``).
      2. Global history check (``GlobalTextHistory``).
      Finally a debounce pass (``DebouncedEmitter``) protects against
      typewriter-style incremental rendering.

    Example::

        dedup = SmartDedup()

        # On each pipeline tick:
        text = dedup.process(region_labels, ocr_results)
        if text is not None:
            await translate_and_speak(text)

        # On pipeline stop or config change:
        dedup.reset()
    """

    def __init__(self, config: DedupConfig | None = None) -> None:
        self._cfg = config or DedupConfig()
        # One line tracker per region label, created lazily in _get_tracker().
        self._region_trackers: dict[str, RegionLineTracker] = {}
        self._global_history = GlobalTextHistory(
            max_size=self._cfg.history_size,
            ttl=self._cfg.history_ttl,
            similarity=self._cfg.history_similarity,
        )
        self._change_detector = ChangeDetector(
            min_chars=self._cfg.min_new_chars,
            min_words=self._cfg.min_new_words,
            min_alnum_ratio=self._cfg.min_alnum_ratio,
        )
        self._debouncer = DebouncedEmitter(stabilize_time=self._cfg.debounce_time)
        self._cross_pool = CrossRegionPool(similarity=self._cfg.line_similarity)

    # ── Public API ───────────────────────────────────────────────

    def process(
        self,
        region_labels: list[str],
        ocr_results: list[OcrResult],
        *,
        force: bool = False,
    ) -> str | None:
        """Run all dedup layers on multi-region OCR results.

        Args:
            region_labels: Label/ID for each region (used as tracker key).
            ocr_results: OCR result per region (same order as labels).
            force: If ``True``, skip all dedup and return all text.

        Returns:
            Text to translate + speak, or ``None`` if dedup suppressed it.
        """
        if force:
            texts = [r.text.strip() for r in ocr_results if r.text.strip()]
            combined = "\n".join(texts) if texts else None
            if combined:
                self._global_history.add(combined)
                # Also update region trackers so we don't double-read next tick
                for label, result in zip(region_labels, ocr_results):
                    tracker = self._get_tracker(label)
                    tracker.extract_new_lines(result)  # just mark as known
            # Drop any debounced pending text; the force-read supersedes it.
            # (Fix: the returned value was previously bound to an unused
            # local and silently discarded — it is intentionally ignored.)
            self._debouncer.flush()
            return combined

        # Layer 1: Per-region line tracking + cross-region dedup
        self._cross_pool.clear()
        all_new_lines: list[str] = []

        for label, result in zip(region_labels, ocr_results):
            if result.error or result.is_empty:
                continue
            tracker = self._get_tracker(label)
            region_new = tracker.extract_new_lines(result)

            for line in region_new:
                if not self._cross_pool.is_seen(line):
                    self._cross_pool.mark(line)
                    all_new_lines.append(line)

        if not all_new_lines:
            return None

        # Layer 3: Semantic significance check
        if not self._change_detector.is_significant(all_new_lines):
            logger.debug(
                "Dedup: new lines not significant (%d lines, %d chars)",
                len(all_new_lines),
                sum(len(line) for line in all_new_lines),
            )
            return None

        composed = "\n".join(all_new_lines)

        # Layer 2: Global history check
        is_dup, sim = self._global_history.is_duplicate(composed)
        if is_dup:
            logger.debug("Dedup: global history match (sim=%.3f)", sim)
            return None

        # Debounce (typewriter effect protection)
        stabilized = self._debouncer.feed(composed)
        if stabilized is None:
            logger.debug("Dedup: waiting for text stabilization")
            return None

        # New, significant, stabilized text — emit!
        self._global_history.add(stabilized)
        return stabilized

    def force_flush(self) -> str | None:
        """Force-emit any debounced pending text (records it in history)."""
        pending = self._debouncer.flush()
        if pending:
            self._global_history.add(pending)
        return pending

    def update_config(self, config: DedupConfig) -> None:
        """Apply new configuration. Recreates internal components."""
        self._cfg = config
        # Rebuild components with new settings
        self._global_history = GlobalTextHistory(
            max_size=config.history_size,
            ttl=config.history_ttl,
            similarity=config.history_similarity,
        )
        self._change_detector = ChangeDetector(
            min_chars=config.min_new_chars,
            min_words=config.min_new_words,
            min_alnum_ratio=config.min_alnum_ratio,
        )
        self._debouncer = DebouncedEmitter(stabilize_time=config.debounce_time)
        self._cross_pool = CrossRegionPool(similarity=config.line_similarity)
        # Update existing region trackers in place so their learned lines
        # survive the config change (reaches into private attrs by design).
        for tracker in self._region_trackers.values():
            tracker._similarity = config.line_similarity
            tracker._line_ttl = config.line_ttl

    def reset(self) -> None:
        """Clear all state (e.g. on scene change or pipeline restart)."""
        for tracker in self._region_trackers.values():
            tracker.reset()
        self._global_history.reset()
        self._debouncer.reset()
        self._cross_pool.clear()
        logger.info("SmartDedup: all layers reset")

    def reset_region(self, label: str) -> None:
        """Reset a specific region tracker (no-op for unknown labels)."""
        if label in self._region_trackers:
            self._region_trackers[label].reset()

    @property
    def stats(self) -> dict[str, int]:
        """Return diagnostic stats."""
        return {
            "tracked_regions": len(self._region_trackers),
            "total_known_lines": sum(
                t.known_count for t in self._region_trackers.values()
            ),
            "history_size": self._global_history.size,
        }

    # ── Internal ─────────────────────────────────────────────────

    def _get_tracker(self, label: str) -> RegionLineTracker:
        """Get or create a line tracker for the given region label."""
        if label not in self._region_trackers:
            self._region_trackers[label] = RegionLineTracker(
                similarity=self._cfg.line_similarity,
                line_ttl=self._cfg.line_ttl,
            )
        return self._region_trackers[label]
|
_archive/hooks/hook_decrypt.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hook BCryptDecrypt using ctypes in-process hooking via DLL detour.
|
| 3 |
+
Instead of Frida, we directly hook BCryptDecrypt's IAT entry in oneocr.dll.
|
| 4 |
+
"""
|
| 5 |
+
import ctypes
|
| 6 |
+
import ctypes.wintypes as wt
|
| 7 |
+
from ctypes import (
|
| 8 |
+
c_int64, c_char_p, c_ubyte, POINTER, byref, Structure,
|
| 9 |
+
c_void_p, c_ulong, c_int32, WINFUNCTYPE, CFUNCTYPE, c_uint8
|
| 10 |
+
)
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import struct
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
# Where intercepted plaintext buffers are dumped for offline analysis.
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
OUTPUT_DIR.mkdir(exist_ok=True)

# Location of oneocr.dll and its encrypted model bundle.
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
# Key string passed to CreateOcrPipeline (as observed; not necessarily the
# raw cipher key itself — TODO confirm how the DLL derives the AES key).
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'

# ── Globals to collect intercepted data ──
intercepted_calls = []    # one metadata dict per hooked BCryptDecrypt call
decrypt_call_num = 0      # monotonically increasing call counter

# ── BCryptDecrypt signature ──
# NTSTATUS BCryptDecrypt(BCRYPT_KEY_HANDLE, PUCHAR pbInput, ULONG cbInput,
#                        VOID* pPadding, PUCHAR pbIV, ULONG cbIV, PUCHAR pbOutput,
#                        ULONG cbOutput, ULONG* pcbResult, ULONG dwFlags)

# WINFUNCTYPE: Windows (stdcall on x86) calling convention for the detour.
BCRYPT_DECRYPT_TYPE = WINFUNCTYPE(
    c_ulong,            # NTSTATUS return
    c_void_p,           # hKey
    c_void_p,           # pbInput
    c_ulong,            # cbInput
    c_void_p,           # pPaddingInfo
    c_void_p,           # pbIV
    c_ulong,            # cbIV
    c_void_p,           # pbOutput
    c_ulong,            # cbOutput
    POINTER(c_ulong),   # pcbResult
    c_ulong,            # dwFlags
)

# Store original function (populated by main() after the IAT hook succeeds).
original_bcrypt_decrypt = None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def hooked_bcrypt_decrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                          pbOutput, cbOutput, pcbResult, dwFlags):
    """Detour for BCryptDecrypt: log arguments, forward to the real function,
    then dump the decrypted output buffer to OUTPUT_DIR.

    The parameter list mirrors BCryptDecrypt exactly, and the NTSTATUS from
    the original call is returned unchanged so the hooked process behaves
    normally.  (Fix: the bare ``except:`` clauses were narrowed to
    ``except Exception:`` so they no longer swallow KeyboardInterrupt /
    SystemExit inside a native callback.)
    """
    global decrypt_call_num

    call_num = decrypt_call_num
    decrypt_call_num += 1

    # Snapshot the IV before the call — chained modes overwrite it in place.
    iv_before = None
    if pbIV and cbIV > 0:
        try:
            iv_before = ctypes.string_at(pbIV, cbIV)
        except Exception:  # best-effort: the pointer may be unreadable
            pass

    # Snapshot (a prefix of) the ciphertext before the call.
    encrypted_input = None
    if pbInput and cbInput > 0:
        try:
            encrypted_input = ctypes.string_at(pbInput, min(cbInput, 64))
        except Exception:
            pass

    # Call original
    status = original_bcrypt_decrypt(hKey, pbInput, cbInput, pPadding,
                                     pbIV, cbIV, pbOutput, cbOutput,
                                     pcbResult, dwFlags)

    # Number of plaintext bytes actually produced.
    result_size = 0
    if pcbResult:
        result_size = pcbResult[0]

    # Read IV after (CFB mode modifies the IV)
    iv_after = None
    if pbIV and cbIV > 0:
        try:
            iv_after = ctypes.string_at(pbIV, cbIV)
        except Exception:
            pass

    info = {
        'call': call_num,
        'status': status,
        'cbInput': cbInput,
        'cbIV': cbIV,
        'cbOutput': result_size,
        'dwFlags': dwFlags,
        'iv_before': iv_before.hex() if iv_before else None,
        'iv_after': iv_after.hex() if iv_after else None,
    }

    print(f"[BCryptDecrypt #{call_num}] status={status:#x} "
          f"in={cbInput} out={result_size} iv_len={cbIV} flags={dwFlags}")
    if encrypted_input:
        print(f"  Encrypted input[:32]: {encrypted_input[:32].hex()}")
        print(f"  pbInput addr: {pbInput:#x}")
    if iv_before:
        print(f"  IV before: {iv_before.hex()}")
    if iv_after and iv_after != iv_before:
        print(f"  IV after:  {iv_after.hex()}")

    # Save decrypted data
    if status == 0 and result_size > 0 and pbOutput:
        try:
            decrypted = ctypes.string_at(pbOutput, result_size)

            # Check for magic number in the first little-endian DWORD.
            if len(decrypted) >= 4:
                magic = struct.unpack('<I', decrypted[:4])[0]
                info['magic'] = magic
                print(f"  Magic: {magic} | First 32 bytes: {decrypted[:32].hex()}")

                if magic == 1:
                    print(f"  *** MAGIC NUMBER == 1 FOUND! ***")

            # Save to file
            fname = OUTPUT_DIR / f"decrypt_{call_num}_in{cbInput}_out{result_size}.bin"
            fname.write_bytes(decrypted)
            print(f"  -> Saved: {fname.name} ({result_size:,} bytes)")

        except Exception as e:
            print(f"  Error reading output: {e}")

    intercepted_calls.append(info)
    return status
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def hook_iat(dll_handle, target_dll_name, target_func_name, hook_func):
    """
    Hook a function by patching the Import Address Table (IAT) of a DLL.
    Returns the original function pointer.

    Args:
        dll_handle: Load base (HMODULE) of the DLL whose IAT is patched,
            or an object exposing ``_handle`` (e.g. ctypes.WinDLL).
        target_dll_name: Substring matched against the imported DLL's name
            (case-insensitive), e.g. ``'bcrypt'``.
        target_func_name: Exact imported function name, e.g. ``'BCryptDecrypt'``.
        hook_func: Python callable to install; wrapped as BCRYPT_DECRYPT_TYPE.

    Returns:
        ``(original_callable, callback)`` on success, ``(None, None)`` if the
        import was not found.  The caller MUST keep ``callback`` alive — if it
        is garbage-collected the patched IAT slot points at freed memory.

    NOTE(review): pointer size is hard-coded to 8 bytes, i.e. x64-only.
    """
    import pefile

    # Get the DLL file path
    kernel32 = ctypes.windll.kernel32
    buf = ctypes.create_unicode_buffer(260)  # MAX_PATH
    h = ctypes.c_void_p(dll_handle)
    kernel32.GetModuleFileNameW(h, buf, 260)
    dll_path = buf.value

    print(f"Analyzing IAT of: {dll_path}")

    pe = pefile.PE(dll_path)

    # Find the import
    base_addr = dll_handle
    if hasattr(dll_handle, '_handle'):
        # Accept either a raw HMODULE int or a ctypes.WinDLL object.
        base_addr = dll_handle._handle

    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        import_name = entry.dll.decode('utf-8', errors='ignore').lower()
        if target_dll_name.lower() not in import_name:
            continue

        for imp in entry.imports:
            if imp.name and imp.name.decode('utf-8', errors='ignore') == target_func_name:
                # Found it! The IAT entry is at base_addr + imp.address - pe.OPTIONAL_HEADER.ImageBase
                # (imp.address is a VA relative to the preferred ImageBase;
                # subtracting gives the RVA, then rebase onto the live base).
                iat_rva = imp.address - pe.OPTIONAL_HEADER.ImageBase
                iat_addr = base_addr + iat_rva

                print(f"Found {target_func_name} in IAT at RVA={iat_rva:#x}, "
                      f"VA={iat_addr:#x}")

                # Read current value (original function pointer)
                original_ptr = ctypes.c_void_p()
                ctypes.memmove(ctypes.byref(original_ptr), iat_addr, 8)
                print(f"Original function pointer: {original_ptr.value:#x}")

                # Create callback (native thunk for the Python hook)
                callback = BCRYPT_DECRYPT_TYPE(hook_func)
                callback_ptr = ctypes.cast(callback, c_void_p).value

                # Make IAT page writable (it is normally read-only after load)
                old_protect = c_ulong()
                PAGE_READWRITE = 0x04
                kernel32.VirtualProtect(
                    ctypes.c_void_p(iat_addr), 8,
                    PAGE_READWRITE, ctypes.byref(old_protect)
                )

                # Patch IAT
                new_ptr = ctypes.c_void_p(callback_ptr)
                ctypes.memmove(iat_addr, ctypes.byref(new_ptr), 8)

                # Restore protection
                kernel32.VirtualProtect(
                    ctypes.c_void_p(iat_addr), 8,
                    old_protect.value, ctypes.byref(old_protect)
                )

                print(f"IAT patched! New function pointer: {callback_ptr:#x}")

                # Create callable from original so the hook can forward calls
                original_func = BCRYPT_DECRYPT_TYPE(original_ptr.value)

                pe.close()
                return original_func, callback  # Return both to prevent GC

    pe.close()
    return None, None
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def main():
    """Load oneocr.dll, IAT-hook BCryptDecrypt, then create the OCR pipeline
    so the DLL decrypts its model through our hook, dumping plaintext to
    OUTPUT_DIR.  Windows-only; expects the paths in DLL_DIR/MODEL_PATH."""
    global original_bcrypt_decrypt

    print("=" * 70)
    print("IN-PROCESS BCryptDecrypt HOOKING")
    print("=" * 70)

    # Load DLL (SetDllDirectoryW lets oneocr.dll resolve its own deps)
    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.SetDllDirectoryW(DLL_DIR)

    dll_path = os.path.join(DLL_DIR, "oneocr.dll")
    print(f"Loading: {dll_path}")
    dll = ctypes.WinDLL(dll_path)

    # Setup function types (prototypes reverse-engineered from the DLL exports)
    dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
    dll.CreateOcrInitOptions.restype = c_int64
    dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, c_ubyte]
    dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
    dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
    dll.CreateOcrPipeline.restype = c_int64

    # Try approach 1: Direct BCryptDecrypt function pointer replacement
    print("\n--- Setting up BCryptDecrypt hook ---")

    # Get the real BCryptDecrypt (logged for comparison with the IAT slot)
    bcrypt_dll = ctypes.WinDLL("bcrypt")
    real_decrypt_addr = ctypes.cast(
        bcrypt_dll.BCryptDecrypt, c_void_p
    ).value
    print(f"Real BCryptDecrypt address: {real_decrypt_addr:#x}")

    # Instead of IAT patching, let's use a simpler approach:
    # We'll call BCryptDecrypt ourselves to first get a "sizing" call,
    # then intercept the actual decrypt.

    # Actually, the simplest approach: use a manual detour
    # But let's try IAT patching first if pefile is available
    try:
        import pefile
        print("pefile available, trying IAT hook...")

        original_bcrypt_decrypt_func, callback_ref = hook_iat(
            dll._handle, 'bcrypt', 'BCryptDecrypt', hooked_bcrypt_decrypt
        )

        if original_bcrypt_decrypt_func:
            original_bcrypt_decrypt = original_bcrypt_decrypt_func
            print("IAT hook installed successfully!")
        else:
            # NOTE: this Exception is NOT caught below (only ImportError is),
            # so a failed hook aborts the run with a traceback.
            raise Exception("IAT hook failed - function not found in imports")

    except ImportError:
        # HACK: installs pefile into the current environment at runtime.
        print("pefile not available, installing...")
        os.system("uv pip install pefile")
        import pefile

        original_bcrypt_decrypt_func, callback_ref = hook_iat(
            dll._handle, 'bcrypt', 'BCryptDecrypt', hooked_bcrypt_decrypt
        )

        if original_bcrypt_decrypt_func:
            original_bcrypt_decrypt = original_bcrypt_decrypt_func
        else:
            print("ERROR: Could not hook BCryptDecrypt")
            return

    # Now create the pipeline - this will trigger decryption via our hook
    print("\n--- Creating OCR Pipeline (will trigger BCryptDecrypt) ---")

    init_options = c_int64()
    ret = dll.CreateOcrInitOptions(byref(init_options))
    print(f"CreateOcrInitOptions: {ret}")

    # Delay-load disabled (0) so ALL model chunks decrypt right now.
    ret = dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)
    print(f"SetUseModelDelayLoad: {ret}")

    pipeline = c_int64()
    model_buf = ctypes.create_string_buffer(MODEL_PATH.encode())
    key_buf = ctypes.create_string_buffer(KEY)

    print(f"\nCalling CreateOcrPipeline...")
    print(f"Model: {MODEL_PATH}")
    print(f"Key: {KEY}")
    print()

    ret = dll.CreateOcrPipeline(model_buf, key_buf, init_options, byref(pipeline))

    print(f"\nCreateOcrPipeline returned: {ret}")
    print(f"Pipeline handle: {pipeline.value}")

    # Summary
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Total BCryptDecrypt calls intercepted: {len(intercepted_calls)}")

    magic_1_files = []
    for info in intercepted_calls:
        if info.get('magic') == 1:
            magic_1_files.append(info)

    if magic_1_files:
        print(f"\n*** Found {len(magic_1_files)} calls with magic_number == 1! ***")
        for info in magic_1_files:
            print(f"  Call #{info['call']}: input={info['cbInput']:,}, "
                  f"output={info['cbOutput']:,}")

    # List saved files
    if OUTPUT_DIR.exists():
        files = sorted(OUTPUT_DIR.glob("decrypt_*.bin"))
        if files:
            print(f"\nSaved {len(files)} decrypted buffers:")
            total = 0
            for f in files:
                sz = f.stat().st_size
                total += sz
                header = open(f, 'rb').read(4)
                magic = struct.unpack('<I', header)[0] if len(header) >= 4 else -1
                marker = " *** MAGIC=1 ***" if magic == 1 else ""
                print(f"  {f.name}: {sz:,} bytes (magic={magic}){marker}")
            print(f"Total: {total:,} bytes ({total/1024/1024:.1f} MB)")

    print("\nDone!")
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
if __name__ == '__main__':
|
| 344 |
+
main()
|
_archive/hooks/hook_full_bcrypt.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Extended BCrypt hook - intercepts ALL BCrypt functions to capture the full
|
| 3 |
+
crypto setup: algorithm provider, properties, and actual key material.
|
| 4 |
+
"""
|
| 5 |
+
import ctypes
|
| 6 |
+
import ctypes.wintypes as wt
|
| 7 |
+
from ctypes import (
|
| 8 |
+
c_int64, c_char_p, c_ubyte, POINTER, byref, Structure,
|
| 9 |
+
c_void_p, c_ulong, c_int32, WINFUNCTYPE, CFUNCTYPE, c_uint8
|
| 10 |
+
)
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import struct
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
# Where intercepted buffers / key material notes are written.
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
OUTPUT_DIR.mkdir(exist_ok=True)

# Location of oneocr.dll and its encrypted model bundle.
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
# Key string passed to CreateOcrPipeline (as observed in the wild).
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'

# ── Globals ──
intercepted_bcrypt = []   # metadata for every hooked BCrypt call
decrypt_call_num = 0      # running BCryptDecrypt call counter

# ── Function types ──
# All WINFUNCTYPE prototypes below mirror the bcrypt.h signatures, with every
# pointer collapsed to c_void_p (we only need the raw addresses for logging).

# BCryptDecrypt
BCRYPT_DECRYPT_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, c_void_p, c_ulong, c_void_p,
    c_void_p, c_ulong, c_void_p, c_ulong, POINTER(c_ulong), c_ulong
)

# BCryptOpenAlgorithmProvider(phAlgorithm, pszAlgId, pszImplementation, dwFlags)
BCRYPT_OPEN_ALG_TYPE = WINFUNCTYPE(
    c_ulong, POINTER(c_void_p), c_void_p, c_void_p, c_ulong
)

# BCryptSetProperty(hObject, pszProperty, pbInput, cbInput, dwFlags)
BCRYPT_SET_PROP_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, c_void_p, c_void_p, c_ulong, c_ulong
)

# BCryptGetProperty(hObject, pszProperty, pbOutput, cbOutput, pcbResult, dwFlags)
BCRYPT_GET_PROP_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, c_void_p, c_void_p, c_ulong, POINTER(c_ulong), c_ulong
)

# BCryptGenerateSymmetricKey(hAlgorithm, phKey, pbKeyObject, cbKeyObject,
#                             pbSecret, cbSecret, dwFlags)
BCRYPT_GEN_KEY_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, POINTER(c_void_p), c_void_p, c_ulong,
    c_void_p, c_ulong, c_ulong
)

# BCryptImportKey(hAlgorithm, hImportKey, pszBlobType, phKey, pbKeyObject,
#                 cbKeyObject, pbInput, cbInput, dwFlags)
BCRYPT_IMPORT_KEY_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, c_void_p, c_void_p, POINTER(c_void_p),
    c_void_p, c_ulong, c_void_p, c_ulong, c_ulong
)

# BCryptEncrypt - same signature as BCryptDecrypt
BCRYPT_ENCRYPT_TYPE = WINFUNCTYPE(
    c_ulong, c_void_p, c_void_p, c_ulong, c_void_p,
    c_void_p, c_ulong, c_void_p, c_ulong, POINTER(c_ulong), c_ulong
)

# Store originals (filled in once each IAT entry is hooked)
orig_decrypt = None
orig_open_alg = None
orig_set_prop = None
orig_get_prop = None
orig_gen_key = None
orig_import_key = None
orig_encrypt = None

# Keep callback references alive — a collected WINFUNCTYPE thunk leaves the
# patched IAT slot pointing at freed memory.
_callback_refs = []

# Track key handles -> key material
key_handle_to_material = {}   # BCRYPT_KEY_HANDLE -> raw secret bytes
alg_handle_to_name = {}       # BCRYPT_ALG_HANDLE -> algorithm name (e.g. 'AES')
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def read_wstr(ptr):
    """Read a null-terminated UTF-16LE (wide) string from a raw pointer.

    Returns the placeholder ``"<null>"`` for a NULL pointer and ``"<err>"``
    when the memory cannot be read, so hook loggers can call this
    unconditionally.  (Fix: bare ``except:`` narrowed to ``except
    Exception:`` so KeyboardInterrupt/SystemExit are not swallowed inside a
    native callback.)
    """
    if not ptr:
        return "<null>"
    try:
        return ctypes.wstring_at(ptr)
    except Exception:  # invalid / unreadable pointer — report, don't crash
        return "<err>"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def hooked_open_alg(phAlgorithm, pszAlgId, pszImplementation, dwFlags):
    """Detour for BCryptOpenAlgorithmProvider: log which algorithm the DLL
    opens (e.g. 'AES') and remember handle -> algorithm name so the other
    hooks can correlate later calls back to the algorithm."""
    alg_name = read_wstr(pszAlgId)
    impl = read_wstr(pszImplementation)
    status = orig_open_alg(phAlgorithm, pszAlgId, pszImplementation, dwFlags)
    handle = phAlgorithm[0] if phAlgorithm else None
    if handle:
        # ctypes may yield a c_void_p or a plain int depending on the path.
        alg_handle_to_name[handle.value if hasattr(handle, 'value') else handle] = alg_name
    print(f"[BCryptOpenAlgorithmProvider] alg={alg_name!r} impl={impl!r} "
          f"flags={dwFlags:#x} -> handle={handle} status={status:#010x}")
    return status
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def hooked_set_prop(hObject, pszProperty, pbInput, cbInput, dwFlags):
    """Detour for BCryptSetProperty: log the property being set (chaining
    mode, key length, ...) together with a readable rendering of its value,
    then forward the call unchanged.

    (Fix: bare ``except:`` clauses narrowed — the inner decode fallback now
    catches only ``UnicodeDecodeError``; the outer guard catches
    ``Exception`` instead of everything.)
    """
    prop_name = read_wstr(pszProperty)

    # Read property value: try UTF-16 first (string properties such as the
    # chaining mode), fall back to hex, and annotate 4-byte values as DWORDs.
    value_repr = ""
    if pbInput and cbInput > 0:
        try:
            raw = ctypes.string_at(pbInput, cbInput)
            # Try as wstring first (for chaining mode etc)
            try:
                value_repr = raw.decode('utf-16-le').rstrip('\x00')
            except UnicodeDecodeError:
                value_repr = raw.hex()
            # Also try as DWORD for numeric properties
            if cbInput == 4:
                dword_val = struct.unpack('<I', raw)[0]
                value_repr = f"{value_repr} (dword={dword_val})"
        except Exception:  # unreadable buffer — logging only, never crash
            value_repr = "<err>"

    status = orig_set_prop(hObject, pszProperty, pbInput, cbInput, dwFlags)
    h = hObject.value if hasattr(hObject, 'value') else hObject
    alg = alg_handle_to_name.get(h, "?")
    print(f"[BCryptSetProperty] obj={h:#x} ({alg}) prop={prop_name!r} "
          f"value={value_repr!r} size={cbInput} flags={dwFlags:#x} "
          f"-> status={status:#010x}")
    return status
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def hooked_get_prop(hObject, pszProperty, pbOutput, cbOutput, pcbResult, dwFlags):
    """Detour for BCryptGetProperty: forward the call, then log the returned
    value (object lengths, block size, ...) in a readable form.

    (Fix: bare ``except:`` clauses narrowed — the decode fallback catches
    ``UnicodeDecodeError``; the outer guard catches ``Exception``.)
    """
    prop_name = read_wstr(pszProperty)
    status = orig_get_prop(hObject, pszProperty, pbOutput, cbOutput, pcbResult, dwFlags)

    result_size = pcbResult[0] if pcbResult else 0
    value_repr = ""
    if status == 0 and pbOutput and result_size > 0:
        try:
            raw = ctypes.string_at(pbOutput, result_size)
            if result_size == 4:
                # Numeric properties are 4-byte little-endian DWORDs.
                value_repr = f"dword={struct.unpack('<I', raw)[0]}"
            elif result_size <= 64:
                try:
                    value_repr = raw.decode('utf-16-le').rstrip('\x00')
                except UnicodeDecodeError:
                    value_repr = raw.hex()
            else:
                value_repr = f"{result_size} bytes"
        except Exception:  # unreadable output buffer — logging only
            pass

    print(f"[BCryptGetProperty] prop={prop_name!r} -> {value_repr!r} "
          f"({result_size} bytes) status={status:#010x}")
    return status
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def hooked_gen_key(hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                   pbSecret, cbSecret, dwFlags):
    """Log BCryptGenerateSymmetricKey and record the secret per key handle.

    The secret is copied BEFORE the call in case the provider scrubs it,
    and stashed in key_handle_to_material keyed by the new key handle.
    """
    # Capture the secret key material BEFORE the call
    secret = None
    if pbSecret and cbSecret > 0:
        try:
            secret = ctypes.string_at(pbSecret, cbSecret)
        except Exception:  # was a bare except: unreadable pointer, skip capture
            pass

    status = orig_gen_key(hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                          pbSecret, cbSecret, dwFlags)

    key_handle = phKey[0] if phKey else None
    alg_h = hAlgorithm.value if hasattr(hAlgorithm, 'value') else hAlgorithm
    alg = alg_handle_to_name.get(alg_h, "?")

    print(f"[BCryptGenerateSymmetricKey] alg={alg} secret_len={cbSecret} "
          f"keyObjSize={cbKeyObject} flags={dwFlags:#x} "
          f"-> key={key_handle} status={status:#010x}")
    if secret:
        print(f"  Secret bytes: {secret.hex()}")
        print(f"  Secret ASCII: {secret!r}")
        if key_handle:
            kh = key_handle.value if hasattr(key_handle, 'value') else key_handle
            key_handle_to_material[kh] = secret

    return status
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def hooked_import_key(hAlgorithm, hImportKey, pszBlobType, phKey,
                      pbKeyObject, cbKeyObject, pbInput, cbInput, dwFlags):
    """Log BCryptImportKey, decoding the key blob header when present.

    For blobs longer than 12 bytes, parses the BCRYPT_KEY_DATA_BLOB-style
    header (magic, version, key length) and prints the raw key bytes.
    """
    blob_type = read_wstr(pszBlobType)

    blob_data = None
    if pbInput and cbInput > 0:
        try:
            blob_data = ctypes.string_at(pbInput, cbInput)
        except Exception:  # was a bare except: unreadable pointer, skip capture
            pass

    status = orig_import_key(hAlgorithm, hImportKey, pszBlobType, phKey,
                             pbKeyObject, cbKeyObject, pbInput, cbInput, dwFlags)

    key_handle = phKey[0] if phKey else None
    print(f"[BCryptImportKey] blob_type={blob_type!r} blob_size={cbInput} "
          f"flags={dwFlags:#x} -> key={key_handle} status={status:#010x}")
    if blob_data:
        print(f"  Blob: {blob_data.hex()}")
        if cbInput > 12:
            # Header layout: DWORD magic, DWORD version, DWORD key length.
            magic, ver, key_len = struct.unpack('<III', blob_data[:12])
            key_bytes = blob_data[12:12 + key_len]
            print(f"  Magic={magic:#x} Ver={ver} KeyLen={key_len}")
            print(f"  Key: {key_bytes.hex()}")
            print(f"  Key ASCII: {key_bytes!r}")

    return status
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def hooked_encrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                   pbOutput, cbOutput, pcbResult, dwFlags):
    """Transparent BCryptEncrypt passthrough that logs sizes and status."""
    status = orig_encrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                          pbOutput, cbOutput, pcbResult, dwFlags)
    if pcbResult:
        result_size = pcbResult[0]
    else:
        result_size = 0
    print(f"[BCryptEncrypt] in={cbInput} out={result_size} iv_len={cbIV} "
          f"flags={dwFlags:#x} status={status:#010x}")
    return status
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def hooked_bcrypt_decrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                          pbOutput, cbOutput, pcbResult, dwFlags):
    """Log BCryptDecrypt calls and dump every decrypted buffer to disk.

    Captures IV (before/after the call), a preview of the ciphertext, and
    any known key material for the key handle. Successful plaintext buffers
    are written to OUTPUT_DIR as decrypt_<n>_in<cb>_out<size>.bin.
    """
    global decrypt_call_num
    call_num = decrypt_call_num
    decrypt_call_num += 1

    def _peek(ptr, size):
        # Best-effort copy of a foreign buffer; None if absent or unreadable.
        # Replaces three duplicated try/bare-except stanzas from the original.
        if not (ptr and size > 0):
            return None
        try:
            return ctypes.string_at(ptr, size)
        except Exception:
            return None

    iv_before = _peek(pbIV, cbIV)
    encrypted_input = _peek(pbInput, min(cbInput, 64))

    status = orig_decrypt(hKey, pbInput, cbInput, pPadding,
                          pbIV, cbIV, pbOutput, cbOutput, pcbResult, dwFlags)

    result_size = pcbResult[0] if pcbResult else 0

    # CBC-style providers update the IV in place; capture it again.
    iv_after = _peek(pbIV, cbIV)

    # Check if we know the key material for this handle
    kh = hKey.value if hasattr(hKey, 'value') else hKey
    known_key = key_handle_to_material.get(kh)

    print(f"[BCryptDecrypt #{call_num}] status={status:#x} "
          f"in={cbInput} out={result_size} iv_len={cbIV} flags={dwFlags}")
    if known_key:
        print(f"  Key material: {known_key.hex()}")
    if encrypted_input:
        print(f"  Enc input[:32]: {encrypted_input[:32].hex()}")
    if iv_before:
        print(f"  IV before: {iv_before.hex()}")
    if iv_after and iv_after != iv_before:
        print(f"  IV after: {iv_after.hex()}")

    if status == 0 and result_size > 0 and pbOutput:
        try:
            decrypted = ctypes.string_at(pbOutput, result_size)
            print(f"  Decrypted[:32]: {decrypted[:32].hex()}")
            fname = OUTPUT_DIR / f"decrypt_{call_num}_in{cbInput}_out{result_size}.bin"
            fname.write_bytes(decrypted)
            print(f"  -> Saved: {fname.name}")
        except Exception as e:
            print(f"  Error: {e}")

    return status
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def hook_iat_generic(dll_handle, target_dll_name, target_func_name, hook_func, func_type):
    """Hook a function by patching the IAT. Returns (original_func, callback_ref).

    Parses the loaded module's PE import table with pefile, overwrites the
    matching IAT slot with a ctypes callback, and returns both the original
    function (wrapped in func_type) and the callback object, which the
    caller must keep alive. Returns (None, None) if the import is absent.

    Fix: the pefile handle is now closed on every path (the original leaked
    it when PE parsing or patching raised).
    """
    import pefile

    kernel32 = ctypes.windll.kernel32
    buf = ctypes.create_unicode_buffer(260)
    h = ctypes.c_void_p(dll_handle)
    kernel32.GetModuleFileNameW(h, buf, 260)
    dll_path = buf.value

    pe = pefile.PE(dll_path)
    try:
        base_addr = dll_handle

        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            import_name = entry.dll.decode('utf-8', errors='ignore').lower()
            if target_dll_name.lower() not in import_name:
                continue

            for imp in entry.imports:
                if imp.name and imp.name.decode('utf-8', errors='ignore') == target_func_name:
                    # IAT slot virtual address relative to the actual load base.
                    iat_rva = imp.address - pe.OPTIONAL_HEADER.ImageBase
                    iat_addr = base_addr + iat_rva

                    # Save the original 8-byte pointer before overwriting it.
                    original_ptr = ctypes.c_void_p()
                    ctypes.memmove(ctypes.byref(original_ptr), iat_addr, 8)

                    callback = func_type(hook_func)
                    callback_ptr = ctypes.cast(callback, c_void_p).value

                    # Temporarily mark the IAT page writable (0x04 = PAGE_READWRITE).
                    old_protect = c_ulong()
                    kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, 0x04, ctypes.byref(old_protect))
                    new_ptr = ctypes.c_void_p(callback_ptr)
                    ctypes.memmove(iat_addr, ctypes.byref(new_ptr), 8)
                    kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, old_protect.value, ctypes.byref(old_protect))

                    original_func = func_type(original_ptr.value)
                    print(f"  Hooked {target_func_name} at IAT RVA={iat_rva:#x}")
                    return original_func, callback

        return None, None
    finally:
        pe.close()
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def main():
    """Load oneocr.dll, hook its BCrypt imports, and trigger model decryption.

    Installs IAT hooks for all crypto-relevant bcrypt functions, then calls
    CreateOcrPipeline, which drives the encrypted-model loading and therefore
    fires every hook. Finishes with a summary of captured keys and dumps.
    """
    global orig_decrypt, orig_open_alg, orig_set_prop, orig_get_prop
    global orig_gen_key, orig_import_key, orig_encrypt

    print("=" * 70)
    print("EXTENDED BCrypt HOOK - Capturing ALL crypto setup")
    print("=" * 70)

    # Clean dump dir
    for f in OUTPUT_DIR.glob("decrypt_*.bin"):
        f.unlink()

    # Make sure oneocr.dll's dependencies resolve from the OCR data directory.
    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.SetDllDirectoryW(DLL_DIR)

    dll_path = os.path.join(DLL_DIR, "oneocr.dll")
    print(f"Loading: {dll_path}")
    dll = ctypes.WinDLL(dll_path)

    # Declare the exported OCR API signatures used below.
    dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
    dll.CreateOcrInitOptions.restype = c_int64
    dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, c_ubyte]
    dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
    dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
    dll.CreateOcrPipeline.restype = c_int64

    import pefile  # noqa

    # Hook ALL BCrypt functions
    hooks = [
        ('bcrypt', 'BCryptOpenAlgorithmProvider', hooked_open_alg, BCRYPT_OPEN_ALG_TYPE),
        ('bcrypt', 'BCryptSetProperty', hooked_set_prop, BCRYPT_SET_PROP_TYPE),
        ('bcrypt', 'BCryptGetProperty', hooked_get_prop, BCRYPT_GET_PROP_TYPE),
        ('bcrypt', 'BCryptGenerateSymmetricKey', hooked_gen_key, BCRYPT_GEN_KEY_TYPE),
        ('bcrypt', 'BCryptImportKey', hooked_import_key, BCRYPT_IMPORT_KEY_TYPE),
        ('bcrypt', 'BCryptEncrypt', hooked_encrypt, BCRYPT_ENCRYPT_TYPE),
        ('bcrypt', 'BCryptDecrypt', hooked_bcrypt_decrypt, BCRYPT_DECRYPT_TYPE),
    ]

    originals = {}
    print("\n--- Installing IAT hooks ---")
    for target_dll, func_name, hook_func, func_type in hooks:
        orig, cb = hook_iat_generic(dll._handle, target_dll, func_name, hook_func, func_type)
        if orig:
            originals[func_name] = orig
            # Keep a reference so the ctypes callback isn't garbage-collected.
            _callback_refs.append(cb)
        else:
            print(f"  WARNING: {func_name} not found in IAT (may not be imported)")

    orig_open_alg = originals.get('BCryptOpenAlgorithmProvider')
    orig_set_prop = originals.get('BCryptSetProperty')
    orig_get_prop = originals.get('BCryptGetProperty')
    orig_gen_key = originals.get('BCryptGenerateSymmetricKey')
    orig_import_key = originals.get('BCryptImportKey')
    orig_encrypt = originals.get('BCryptEncrypt')
    orig_decrypt = originals.get('BCryptDecrypt')

    if not orig_decrypt:
        print("FATAL: Could not hook BCryptDecrypt!")
        return

    print("\n--- Creating OCR Pipeline ---")
    init_options = c_int64()
    ret = dll.CreateOcrInitOptions(byref(init_options))
    print(f"CreateOcrInitOptions: {ret}")

    # Disable delay-load so the model is decrypted immediately.
    ret = dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)
    print(f"SetUseModelDelayLoad: {ret}")

    pipeline = c_int64()
    model_buf = ctypes.create_string_buffer(MODEL_PATH.encode())
    key_buf = ctypes.create_string_buffer(KEY)

    print(f"\nCalling CreateOcrPipeline...")
    print(f"Model: {MODEL_PATH}")
    print(f"Key: {KEY}")
    print()

    # This is the call that performs the model decryption and fires the hooks.
    ret = dll.CreateOcrPipeline(model_buf, key_buf, init_options, byref(pipeline))

    print(f"\nCreateOcrPipeline returned: {ret}")
    print(f"Pipeline handle: {pipeline.value}")

    # Summary
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Key handles tracked: {len(key_handle_to_material)}")
    for kh, mat in key_handle_to_material.items():
        print(f"  Handle {kh:#x}: {mat.hex()}")
        print(f"  ASCII: {mat!r}")
        print(f"  Length: {len(mat)}")

    files = sorted(OUTPUT_DIR.glob("decrypt_*.bin"))
    if files:
        print(f"\nSaved {len(files)} decrypted buffers")
        total = sum(f.stat().st_size for f in files)
        print(f"Total: {total:,} bytes ({total/1024/1024:.1f} MB)")

    print("\nDone!")
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|
_archive/hooks/hook_full_log.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Full BCrypt hash hook - saves all hash inputs and AES keys to JSON for analysis.
|
| 3 |
+
"""
|
| 4 |
+
import ctypes
|
| 5 |
+
from ctypes import (
|
| 6 |
+
c_int64, c_char_p, c_ubyte, POINTER, byref,
|
| 7 |
+
c_void_p, c_ulong, WINFUNCTYPE
|
| 8 |
+
)
|
| 9 |
+
import os
|
| 10 |
+
import struct
|
| 11 |
+
import json
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
|
| 15 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 16 |
+
|
| 17 |
+
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
|
| 18 |
+
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
|
| 19 |
+
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 20 |
+
|
| 21 |
+
# Globals
|
| 22 |
+
decrypt_call_num = 0
|
| 23 |
+
_callback_refs = []
|
| 24 |
+
key_handle_to_material = {}
|
| 25 |
+
hash_handle_to_data = {}
|
| 26 |
+
alg_handle_to_name = {}
|
| 27 |
+
|
| 28 |
+
# Collect all crypto operations for JSON output
|
| 29 |
+
crypto_log = []
|
| 30 |
+
|
| 31 |
+
DECRYPT_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_void_p,
|
| 32 |
+
c_void_p, c_ulong, c_void_p, c_ulong, POINTER(c_ulong), c_ulong)
|
| 33 |
+
OPEN_ALG_T = WINFUNCTYPE(c_ulong, POINTER(c_void_p), c_void_p, c_void_p, c_ulong)
|
| 34 |
+
SET_PROP_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 35 |
+
GEN_KEY_T = WINFUNCTYPE(c_ulong, c_void_p, POINTER(c_void_p), c_void_p, c_ulong,
|
| 36 |
+
c_void_p, c_ulong, c_ulong)
|
| 37 |
+
CREATE_HASH_T = WINFUNCTYPE(c_ulong, c_void_p, POINTER(c_void_p), c_void_p, c_ulong,
|
| 38 |
+
c_void_p, c_ulong, c_ulong)
|
| 39 |
+
HASH_DATA_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 40 |
+
FINISH_HASH_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 41 |
+
ENCRYPT_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_void_p,
|
| 42 |
+
c_void_p, c_ulong, c_void_p, c_ulong, POINTER(c_ulong), c_ulong)
|
| 43 |
+
|
| 44 |
+
orig = {}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def read_wstr(ptr):
    """Best-effort read of a NUL-terminated UTF-16 string at *ptr*.

    Returns "<null>" for a null/zero pointer and "<err>" when the memory
    cannot be read; the hook must never raise into the foreign caller.
    """
    if not ptr:
        return "<null>"
    try:
        return ctypes.wstring_at(ptr)
    except Exception:  # was a bare except: narrow to Exception
        return "<err>"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def hooked_open_alg(phAlgorithm, pszAlgId, pszImplementation, dwFlags):
    """Record which algorithm name each new provider handle maps to."""
    alg_name = read_wstr(pszAlgId)
    status = orig['OpenAlgorithmProvider'](phAlgorithm, pszAlgId, pszImplementation, dwFlags)
    handle = phAlgorithm[0] if phAlgorithm else None
    if handle:
        key = getattr(handle, 'value', handle)
        alg_handle_to_name[key] = alg_name
    return status
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def hooked_set_prop(hObject, pszProperty, pbInput, cbInput, dwFlags):
    """Transparent passthrough to the real BCryptSetProperty."""
    status = orig['SetProperty'](hObject, pszProperty, pbInput, cbInput, dwFlags)
    return status
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def hooked_create_hash(hAlgorithm, phHash, pbHashObject, cbHashObject,
                       pbSecret, cbSecret, dwFlags):
    """Track each new hash handle plus any HMAC key supplied at creation."""
    status = orig['CreateHash'](hAlgorithm, phHash, pbHashObject, cbHashObject,
                                pbSecret, cbSecret, dwFlags)
    hash_handle = phHash[0] if phHash else None
    hmac_key = ctypes.string_at(pbSecret, cbSecret) if (pbSecret and cbSecret > 0) else None
    hh = getattr(hash_handle, 'value', hash_handle)
    hash_handle_to_data[hh] = {'hmac_key': hmac_key, 'data_chunks': []}
    return status
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def hooked_hash_data(hHash, pbInput, cbInput, dwFlags):
    """Accumulate every chunk fed into a tracked hash handle."""
    status = orig['HashData'](hHash, pbInput, cbInput, dwFlags)
    hh = getattr(hHash, 'value', hHash)
    if pbInput and cbInput > 0:
        chunk = ctypes.string_at(pbInput, cbInput)
        info = hash_handle_to_data.get(hh)
        if info is not None:
            info['data_chunks'].append(chunk)
    return status
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def hooked_finish_hash(hHash, pbOutput, cbOutput, dwFlags):
    """On hash completion, record the full accumulated input and its digest."""
    status = orig['FinishHash'](hHash, pbOutput, cbOutput, dwFlags)
    hh = getattr(hHash, 'value', hHash)
    output = ctypes.string_at(pbOutput, cbOutput) if (pbOutput and cbOutput > 0) else None
    info = hash_handle_to_data.get(hh)
    if info and output:
        joined = b"".join(info['data_chunks'])
        crypto_log.append({
            'op': 'sha256',
            'input': joined.hex(),
            'input_len': len(joined),
            'output': output.hex(),
        })
    return status
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def hooked_gen_key(hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                   pbSecret, cbSecret, dwFlags):
    """Capture the secret passed to BCryptGenerateSymmetricKey per handle."""
    # Copy the secret before the provider sees the call.
    secret = ctypes.string_at(pbSecret, cbSecret) if (pbSecret and cbSecret > 0) else None
    status = orig['GenerateSymmetricKey'](hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                                          pbSecret, cbSecret, dwFlags)
    key_handle = phKey[0] if phKey else None
    if key_handle and secret:
        kh = getattr(key_handle, 'value', key_handle)
        key_handle_to_material[kh] = secret
    return status
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def hooked_encrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                   pbOutput, cbOutput, pcbResult, dwFlags):
    """Log AES encrypt operations to crypto_log (only when an IV is present)."""
    status = orig['Encrypt'](hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                             pbOutput, cbOutput, pcbResult, dwFlags)
    result_size = pcbResult[0] if pcbResult else 0
    if cbIV > 0:
        iv = ctypes.string_at(pbIV, cbIV) if pbIV else None
        enc_in = None
        if pbInput and cbInput > 0:
            enc_in = ctypes.string_at(pbInput, min(cbInput, 32))
        enc_out = None
        if pbOutput and result_size > 0:
            enc_out = ctypes.string_at(pbOutput, min(result_size, 32))
        kh = getattr(hKey, 'value', hKey)
        crypto_log.append({
            'op': 'encrypt',
            'input_size': cbInput,
            'output_size': result_size,
            'aes_key': key_handle_to_material.get(kh, b'').hex(),
            'input_preview': enc_in.hex() if enc_in else None,
            'output_preview': enc_out.hex() if enc_out else None,
        })
    return status
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def hooked_decrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                   pbOutput, cbOutput, pcbResult, dwFlags):
    """Log AES decrypt operations and dump each plaintext buffer to disk."""
    global decrypt_call_num
    status = orig['Decrypt'](hKey, pbInput, cbInput, pPadding,
                             pbIV, cbIV, pbOutput, cbOutput, pcbResult, dwFlags)
    result_size = pcbResult[0] if pcbResult else 0

    # Only IV-bearing (block-cipher) calls are interesting here.
    if cbIV > 0:
        call_num = decrypt_call_num
        decrypt_call_num += 1
        kh = getattr(hKey, 'value', hKey)
        aes_key = key_handle_to_material.get(kh, b'').hex()

        dec_data = None
        if status == 0 and result_size > 0 and pbOutput:
            dec_data = ctypes.string_at(pbOutput, result_size)
            out_file = OUTPUT_DIR / f"decrypt_{call_num}_in{cbInput}_out{result_size}.bin"
            out_file.write_bytes(dec_data)

        crypto_log.append({
            'op': 'decrypt',
            'call_num': call_num,
            'input_size': cbInput,
            'output_size': result_size,
            'aes_key': aes_key,
            'first_bytes': dec_data[:32].hex() if dec_data else None,
        })

    return status
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def hook_iat(dll_handle, target_dll, func_name, hook_func, func_type):
    """Patch one IAT entry of the loaded DLL; return the original function.

    Returns None when the import is not found. Fix: the pefile handle is
    now closed on every path (the original leaked it if parsing raised).
    """
    import pefile
    kernel32 = ctypes.windll.kernel32
    buf = ctypes.create_unicode_buffer(260)
    kernel32.GetModuleFileNameW(ctypes.c_void_p(dll_handle), buf, 260)
    pe = pefile.PE(buf.value)
    try:
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            if target_dll.lower() not in entry.dll.decode('utf-8', errors='ignore').lower():
                continue
            for imp in entry.imports:
                if imp.name and imp.name.decode('utf-8', errors='ignore') == func_name:
                    # IAT slot address relative to the actual load base.
                    iat_rva = imp.address - pe.OPTIONAL_HEADER.ImageBase
                    iat_addr = dll_handle + iat_rva
                    # Save the original 8-byte pointer before overwriting it.
                    original_ptr = ctypes.c_void_p()
                    ctypes.memmove(ctypes.byref(original_ptr), iat_addr, 8)
                    callback = func_type(hook_func)
                    callback_ptr = ctypes.cast(callback, c_void_p).value
                    # Temporarily mark the IAT page writable (0x04 = PAGE_READWRITE).
                    old_protect = c_ulong()
                    kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, 0x04, byref(old_protect))
                    new_ptr = ctypes.c_void_p(callback_ptr)
                    ctypes.memmove(iat_addr, ctypes.byref(new_ptr), 8)
                    kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, old_protect.value, byref(old_protect))
                    original_func = func_type(original_ptr.value)
                    # Keep the callback alive for the process lifetime.
                    _callback_refs.append(callback)
                    return original_func
        return None
    finally:
        pe.close()
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def main():
    """Hook bcrypt imports of oneocr.dll, build the OCR pipeline, save a JSON log.

    Installs IAT hooks for provider/hash/key/cipher functions, then calls
    CreateOcrPipeline (which decrypts the model and fires the hooks), and
    finally writes all collected operations to temp/crypto_log.json.
    """
    print("BCrypt Full Hook - collecting all crypto operations to JSON...")

    # Remove stale dumps from previous runs.
    for f in OUTPUT_DIR.glob("decrypt_*.bin"):
        f.unlink()

    # Resolve dependent DLLs from the OCR data directory.
    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.SetDllDirectoryW(DLL_DIR)
    dll = ctypes.WinDLL(os.path.join(DLL_DIR, "oneocr.dll"))

    # Exported OCR API signatures used below.
    dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
    dll.CreateOcrInitOptions.restype = c_int64
    dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, c_ubyte]
    dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
    dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
    dll.CreateOcrPipeline.restype = c_int64

    import pefile  # noqa

    hooks = [
        ('bcrypt', 'BCryptOpenAlgorithmProvider', hooked_open_alg, OPEN_ALG_T),
        ('bcrypt', 'BCryptSetProperty', hooked_set_prop, SET_PROP_T),
        ('bcrypt', 'BCryptCreateHash', hooked_create_hash, CREATE_HASH_T),
        ('bcrypt', 'BCryptHashData', hooked_hash_data, HASH_DATA_T),
        ('bcrypt', 'BCryptFinishHash', hooked_finish_hash, FINISH_HASH_T),
        ('bcrypt', 'BCryptGenerateSymmetricKey', hooked_gen_key, GEN_KEY_T),
        ('bcrypt', 'BCryptEncrypt', hooked_encrypt, ENCRYPT_T),
        ('bcrypt', 'BCryptDecrypt', hooked_decrypt, DECRYPT_T),
    ]

    for target_dll, func_name, hook_func, func_type in hooks:
        o = hook_iat(dll._handle, target_dll, func_name, hook_func, func_type)
        if o:
            # Key originals by short name, e.g. orig['Decrypt'].
            orig[func_name.replace('BCrypt', '')] = o

    init_options = c_int64()
    dll.CreateOcrInitOptions(byref(init_options))
    # Disable delay-load so the model is decrypted immediately.
    dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)

    pipeline = c_int64()
    # This call drives the model decryption and therefore all the hooks.
    ret = dll.CreateOcrPipeline(
        ctypes.create_string_buffer(MODEL_PATH.encode()),
        ctypes.create_string_buffer(KEY),
        init_options, byref(pipeline)
    )

    print(f"CreateOcrPipeline: {ret}")
    print(f"Total crypto ops: {len(crypto_log)}")
    print(f"Decrypted chunks: {decrypt_call_num}")

    # Save crypto log
    out_path = Path("temp/crypto_log.json")
    out_path.parent.mkdir(exist_ok=True)
    out_path.write_text(json.dumps(crypto_log, indent=2))
    print(f"Saved crypto log to {out_path}")
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|
_archive/hooks/hook_hash.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hook BCrypt hash functions (CreateHash, HashData, FinishHash) to discover
|
| 3 |
+
the key derivation scheme. Also hook GenerateSymmetricKey and BCryptDecrypt.
|
| 4 |
+
"""
|
| 5 |
+
import ctypes
|
| 6 |
+
from ctypes import (
|
| 7 |
+
c_int64, c_char_p, c_ubyte, POINTER, byref,
|
| 8 |
+
c_void_p, c_ulong, WINFUNCTYPE
|
| 9 |
+
)
|
| 10 |
+
import os
|
| 11 |
+
import struct
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
OUTPUT_DIR = Path(r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\frida_dump")
|
| 15 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 16 |
+
|
| 17 |
+
DLL_DIR = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data"
|
| 18 |
+
MODEL_PATH = os.path.join(DLL_DIR, "oneocr.onemodel")
|
| 19 |
+
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
|
| 20 |
+
|
| 21 |
+
# Globals
|
| 22 |
+
decrypt_call_num = 0
|
| 23 |
+
_callback_refs = []
|
| 24 |
+
key_handle_to_material = {}
|
| 25 |
+
hash_handle_to_data = {} # track hash data per handle
|
| 26 |
+
alg_handle_to_name = {}
|
| 27 |
+
|
| 28 |
+
# ── Function types ──
|
| 29 |
+
DECRYPT_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_void_p,
|
| 30 |
+
c_void_p, c_ulong, c_void_p, c_ulong, POINTER(c_ulong), c_ulong)
|
| 31 |
+
OPEN_ALG_T = WINFUNCTYPE(c_ulong, POINTER(c_void_p), c_void_p, c_void_p, c_ulong)
|
| 32 |
+
SET_PROP_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 33 |
+
GEN_KEY_T = WINFUNCTYPE(c_ulong, c_void_p, POINTER(c_void_p), c_void_p, c_ulong,
|
| 34 |
+
c_void_p, c_ulong, c_ulong)
|
| 35 |
+
|
| 36 |
+
# BCryptCreateHash(hAlgorithm, phHash, pbHashObject, cbHashObject,
|
| 37 |
+
# pbSecret, cbSecret, dwFlags)
|
| 38 |
+
CREATE_HASH_T = WINFUNCTYPE(c_ulong, c_void_p, POINTER(c_void_p), c_void_p, c_ulong,
|
| 39 |
+
c_void_p, c_ulong, c_ulong)
|
| 40 |
+
|
| 41 |
+
# BCryptHashData(hHash, pbInput, cbInput, dwFlags)
|
| 42 |
+
HASH_DATA_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 43 |
+
|
| 44 |
+
# BCryptFinishHash(hHash, pbOutput, cbOutput, dwFlags)
|
| 45 |
+
FINISH_HASH_T = WINFUNCTYPE(c_ulong, c_void_p, c_void_p, c_ulong, c_ulong)
|
| 46 |
+
|
| 47 |
+
# Originals
|
| 48 |
+
orig = {}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def read_wstr(ptr):
    """Best-effort read of a NUL-terminated UTF-16 string at *ptr*.

    Returns "<null>" for a null/zero pointer and "<err>" when the memory
    cannot be read; the hook must never raise into the foreign caller.
    """
    if not ptr:
        return "<null>"
    try:
        return ctypes.wstring_at(ptr)
    except Exception:  # was a bare except: narrow to Exception
        return "<err>"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def hooked_open_alg(phAlgorithm, pszAlgId, pszImplementation, dwFlags):
    """Map each new algorithm-provider handle to its algorithm name and log it."""
    alg_name = read_wstr(pszAlgId)
    status = orig['OpenAlgorithmProvider'](phAlgorithm, pszAlgId, pszImplementation, dwFlags)
    handle = phAlgorithm[0] if phAlgorithm else None
    if handle:
        h = getattr(handle, 'value', handle)
        alg_handle_to_name[h] = alg_name
    print(f"[OpenAlg] {alg_name!r} -> {status:#010x}")
    return status
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def hooked_set_prop(hObject, pszProperty, pbInput, cbInput, dwFlags):
    """Log BCryptSetProperty: property name plus a decoded value preview.

    Tries UTF-16 first (chaining mode etc.), falls back to hex, and appends
    the DWORD interpretation for 4-byte values.
    """
    prop_name = read_wstr(pszProperty)
    value = ""
    if pbInput and cbInput > 0:
        raw = ctypes.string_at(pbInput, cbInput)
        try:
            value = raw.decode('utf-16-le').rstrip('\x00')
        except UnicodeDecodeError:  # was a bare except: only decode can fail here
            value = raw.hex()
        if cbInput == 4:
            value += f" (dword={struct.unpack('<I', raw)[0]})"
    status = orig['SetProperty'](hObject, pszProperty, pbInput, cbInput, dwFlags)
    print(f"[SetProp] {prop_name!r} = {value!r} -> {status:#010x}")
    return status
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def hooked_create_hash(hAlgorithm, phHash, pbHashObject, cbHashObject,
                       pbSecret, cbSecret, dwFlags):
    """Register a new hash handle, noting its algorithm and optional HMAC key."""
    status = orig['CreateHash'](hAlgorithm, phHash, pbHashObject, cbHashObject,
                                pbSecret, cbSecret, dwFlags)
    hash_handle = phHash[0] if phHash else None

    hmac_key = None
    if pbSecret and cbSecret > 0:
        hmac_key = ctypes.string_at(pbSecret, cbSecret)

    hh = getattr(hash_handle, 'value', hash_handle)
    ah = getattr(hAlgorithm, 'value', hAlgorithm)
    alg = alg_handle_to_name.get(ah, "?")

    hash_handle_to_data[hh] = {
        'alg': alg,
        'hmac_key': hmac_key,
        'data_chunks': [],
        'total_len': 0,
    }

    hmac_info = f" HMAC_KEY={hmac_key.hex()} ({hmac_key!r})" if hmac_key else ""
    print(f"[CreateHash] alg={alg} hash={hh:#x}{hmac_info} -> {status:#010x}")
    return status
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def hooked_hash_data(hHash, pbInput, cbInput, dwFlags):
    # Hook for BCryptHashData: forward the call, then record and log the input
    # chunk so the full hash pre-image can be reconstructed at FinishHash time.
    status = orig['HashData'](hHash, pbInput, cbInput, dwFlags)

    # Normalize the ctypes handle to a plain int (dict key).
    hh = hHash.value if hasattr(hHash, 'value') else hHash
    data_bytes = None
    if pbInput and cbInput > 0:
        data_bytes = ctypes.string_at(pbInput, cbInput)

    # Accumulate chunks only for handles registered by hooked_create_hash.
    if hh in hash_handle_to_data and data_bytes:
        info = hash_handle_to_data[hh]
        info['data_chunks'].append(data_bytes)
        info['total_len'] += len(data_bytes)

    # Show data
    data_hex = data_bytes.hex() if data_bytes else ""
    data_ascii = ""
    if data_bytes:
        try:
            data_ascii = data_bytes.decode('ascii', errors='replace')
        except:
            pass
    # Truncate long hex dumps to keep the console log readable.
    preview = data_hex[:128]
    if len(data_hex) > 128:
        preview += "..."

    print(f"[HashData] hash={hh:#x} len={cbInput} data={preview}")
    # Only print the ASCII view when every byte is printable (or CR/LF).
    if data_ascii and all(32 <= c < 127 or c in (10, 13) for c in (data_bytes or b"")):
        print(f"  ASCII: {data_ascii!r}")
    return status
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def hooked_finish_hash(hHash, pbOutput, cbOutput, dwFlags):
    # Hook for BCryptFinishHash: forward the call, then dump the digest and
    # the accumulated input recorded by hooked_hash_data for this handle.
    status = orig['FinishHash'](hHash, pbOutput, cbOutput, dwFlags)

    hh = hHash.value if hasattr(hHash, 'value') else hHash
    output = None
    if pbOutput and cbOutput > 0:
        output = ctypes.string_at(pbOutput, cbOutput)

    # Reassemble everything that was fed into this hash object.
    info = hash_handle_to_data.get(hh)
    all_data = b""
    if info:
        all_data = b"".join(info['data_chunks'])

    print(f"[FinishHash] hash={hh:#x} output_len={cbOutput}")
    if output:
        print(f"  Result: {output.hex()}")
    if info:
        print(f"  Input was: {info['total_len']} bytes in {len(info['data_chunks'])} chunks")
        # Small inputs are dumped in full -- these are the key-derivation
        # pre-images this experiment is hunting for.
        if info['total_len'] <= 256:
            print(f"  Full input: {all_data.hex()}")
            try:
                print(f"  Input ASCII: {all_data!r}")
            except:
                pass
        if info['hmac_key']:
            print(f"  HMAC key: {info['hmac_key'].hex()}")

    return status
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def hooked_gen_key(hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                   pbSecret, cbSecret, dwFlags):
    """Hook for BCryptGenerateSymmetricKey: capture the raw key material and
    remember it by key handle so hooked_decrypt can report the AES key used."""
    # Snapshot the secret before forwarding the call.
    key_material = ctypes.string_at(pbSecret, cbSecret) if (pbSecret and cbSecret > 0) else None

    status = orig['GenerateSymmetricKey'](hAlgorithm, phKey, pbKeyObject, cbKeyObject,
                                          pbSecret, cbSecret, dwFlags)

    new_handle = phKey[0] if phKey else None
    if new_handle and key_material:
        handle_int = new_handle.value if hasattr(new_handle, 'value') else new_handle
        key_handle_to_material[handle_int] = key_material

    print(f"[GenSymKey] secret_len={cbSecret} -> {status:#010x}")
    if key_material:
        print(f"  Secret: {key_material.hex()}")
    return status
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def hooked_decrypt(hKey, pbInput, cbInput, pPadding, pbIV, cbIV,
                   pbOutput, cbOutput, pcbResult, dwFlags):
    # Hook for BCryptDecrypt: log the IV and (if known) the key material, and
    # save each decrypted buffer to disk, numbered by a global call counter.
    global decrypt_call_num

    # Snapshot the IV before the call -- BCryptDecrypt may update pbIV in
    # place (NOTE(review): standard CBC chaining behavior; confirm mode).
    iv_before = None
    if pbIV and cbIV > 0:
        iv_before = ctypes.string_at(pbIV, cbIV)

    status = orig['Decrypt'](hKey, pbInput, cbInput, pPadding,
                             pbIV, cbIV, pbOutput, cbOutput, pcbResult, dwFlags)

    result_size = pcbResult[0] if pcbResult else 0

    # Only log actual decrypts (with IV), skip sizing calls
    if cbIV > 0:
        call_num = decrypt_call_num
        decrypt_call_num += 1

        # Look up the AES key material captured in hooked_gen_key.
        kh = hKey.value if hasattr(hKey, 'value') else hKey
        known_key = key_handle_to_material.get(kh)

        print(f"[Decrypt #{call_num}] in={cbInput} out={result_size} iv_len={cbIV}")
        if iv_before:
            print(f"  IV: {iv_before.hex()}")
        if known_key:
            print(f"  AES key: {known_key.hex()}")

        # status 0 == STATUS_SUCCESS; persist the plaintext for offline analysis.
        if status == 0 and result_size > 0 and pbOutput:
            decrypted = ctypes.string_at(pbOutput, result_size)
            print(f"  Decrypted[:32]: {decrypted[:32].hex()}")
            fname = OUTPUT_DIR / f"decrypt_{call_num}_in{cbInput}_out{result_size}.bin"
            fname.write_bytes(decrypted)

    return status
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def hook_iat(dll_handle, target_dll, func_name, hook_func, func_type):
    # Patch one entry in the loaded module's Import Address Table so calls to
    # `func_name` (imported from `target_dll`) are redirected to `hook_func`.
    # Returns a callable bound to the original function pointer, or None if
    # the import was not found.
    import pefile
    kernel32 = ctypes.windll.kernel32
    # Resolve the module's on-disk path so pefile can parse its import table.
    buf = ctypes.create_unicode_buffer(260)
    kernel32.GetModuleFileNameW(ctypes.c_void_p(dll_handle), buf, 260)
    pe = pefile.PE(buf.value)

    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        if target_dll.lower() not in entry.dll.decode('utf-8', errors='ignore').lower():
            continue
        for imp in entry.imports:
            if imp.name and imp.name.decode('utf-8', errors='ignore') == func_name:
                # imp.address is relative to the preferred ImageBase; rebase
                # it onto the actual load address of the module.
                iat_rva = imp.address - pe.OPTIONAL_HEADER.ImageBase
                iat_addr = dll_handle + iat_rva

                # Save the original 8-byte (x64) function pointer from the slot.
                original_ptr = ctypes.c_void_p()
                ctypes.memmove(ctypes.byref(original_ptr), iat_addr, 8)

                callback = func_type(hook_func)
                callback_ptr = ctypes.cast(callback, c_void_p).value

                # Make the IAT slot writable (0x04 = PAGE_READWRITE), swap in
                # our callback pointer, then restore the old protection.
                old_protect = c_ulong()
                kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, 0x04, byref(old_protect))
                new_ptr = ctypes.c_void_p(callback_ptr)
                ctypes.memmove(iat_addr, ctypes.byref(new_ptr), 8)
                kernel32.VirtualProtect(ctypes.c_void_p(iat_addr), 8, old_protect.value, byref(old_protect))

                original_func = func_type(original_ptr.value)
                pe.close()
                # Keep the ctypes callback object alive; if it were collected,
                # the patched IAT slot would point at freed memory.
                _callback_refs.append(callback)
                return original_func

    pe.close()
    return None
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def main():
    # Load oneocr.dll with BCrypt IAT hooks installed, then trigger
    # CreateOcrPipeline so the model-decryption crypto runs through the hooks.
    print("=" * 70)
    print("BCrypt HASH HOOK - Discover SHA256 key derivation input")
    print("=" * 70)

    # Remove dump files from previous runs.
    for f in OUTPUT_DIR.glob("decrypt_*.bin"):
        f.unlink()

    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.SetDllDirectoryW(DLL_DIR)

    dll_path = os.path.join(DLL_DIR, "oneocr.dll")
    print(f"Loading: {dll_path}")
    dll = ctypes.WinDLL(dll_path)

    # Declare the OneOCR export signatures we call below.
    dll.CreateOcrInitOptions.argtypes = [POINTER(c_int64)]
    dll.CreateOcrInitOptions.restype = c_int64
    dll.OcrInitOptionsSetUseModelDelayLoad.argtypes = [c_int64, c_ubyte]
    dll.OcrInitOptionsSetUseModelDelayLoad.restype = c_int64
    dll.CreateOcrPipeline.argtypes = [c_char_p, c_char_p, c_int64, POINTER(c_int64)]
    dll.CreateOcrPipeline.restype = c_int64

    import pefile  # noqa

    # (target dll, import name, replacement, ctypes prototype)
    hooks = [
        ('bcrypt', 'BCryptOpenAlgorithmProvider', hooked_open_alg, OPEN_ALG_T),
        ('bcrypt', 'BCryptSetProperty', hooked_set_prop, SET_PROP_T),
        ('bcrypt', 'BCryptCreateHash', hooked_create_hash, CREATE_HASH_T),
        ('bcrypt', 'BCryptHashData', hooked_hash_data, HASH_DATA_T),
        ('bcrypt', 'BCryptFinishHash', hooked_finish_hash, FINISH_HASH_T),
        ('bcrypt', 'BCryptGenerateSymmetricKey', hooked_gen_key, GEN_KEY_T),
        ('bcrypt', 'BCryptDecrypt', hooked_decrypt, DECRYPT_T),
    ]

    print("\n--- Installing hooks ---")
    for target_dll, func_name, hook_func, func_type in hooks:
        o = hook_iat(dll._handle, target_dll, func_name, hook_func, func_type)
        if o:
            # Originals are stored under short names, e.g. 'Decrypt' for
            # BCryptDecrypt, which is how the hooks look them up in `orig`.
            short = func_name.replace('BCrypt', '')
            orig[short] = o
            print(f"  OK: {func_name}")
        else:
            print(f"  FAIL: {func_name}")

    print("\n--- Creating OCR Pipeline (triggers crypto) ---")
    init_options = c_int64()
    dll.CreateOcrInitOptions(byref(init_options))
    # Disable delay-load so all models are decrypted during pipeline creation.
    dll.OcrInitOptionsSetUseModelDelayLoad(init_options, 0)

    pipeline = c_int64()
    model_buf = ctypes.create_string_buffer(MODEL_PATH.encode())
    key_buf = ctypes.create_string_buffer(KEY)

    print(f"Model: {MODEL_PATH}")
    print(f"Key: {KEY}")
    print()

    ret = dll.CreateOcrPipeline(model_buf, key_buf, init_options, byref(pipeline))
    print(f"\nCreateOcrPipeline: {ret}")

    # Summary
    print("\n" + "=" * 70)
    print("KEY DERIVATION SUMMARY")
    print("=" * 70)
    print(f"Unique derived keys: {len(key_handle_to_material)}")
    print(f"Hash operations tracked: {len(hash_handle_to_data)}")
    print(f"Decrypted chunks: {decrypt_call_num}")
    print("\nDone!")
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# Run the hook experiment when executed as a script.
if __name__ == '__main__':
    main()
|
_archive/inspect_config_blob.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deep-dive into model_11 and model_22 graph structure — handle binary config."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 7 |
+
|
| 8 |
+
for idx in [11, 22]:
|
| 9 |
+
matches = list(models_dir.glob(f"model_{idx:02d}_*"))
|
| 10 |
+
model = onnx.load(str(matches[0]))
|
| 11 |
+
|
| 12 |
+
print(f"\n{'='*70}")
|
| 13 |
+
print(f"FULL GRAPH: model_{idx:02d}")
|
| 14 |
+
print(f"{'='*70}")
|
| 15 |
+
|
| 16 |
+
# All initializers (weights)
|
| 17 |
+
print(f"\n Initializers ({len(model.graph.initializer)}):")
|
| 18 |
+
for init in model.graph.initializer:
|
| 19 |
+
if init.data_type == 8: # STRING
|
| 20 |
+
raw = init.string_data[0] if init.string_data else init.raw_data
|
| 21 |
+
print(f" {init.name}: STRING, {len(raw)} bytes (binary)")
|
| 22 |
+
else:
|
| 23 |
+
data = onnx.numpy_helper.to_array(init)
|
| 24 |
+
print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, "
|
| 25 |
+
f"range=[{data.min():.4f}, {data.max():.4f}]")
|
| 26 |
+
|
| 27 |
+
# All nodes
|
| 28 |
+
print(f"\n Nodes ({len(model.graph.node)}):")
|
| 29 |
+
for i, node in enumerate(model.graph.node):
|
| 30 |
+
domain_str = f" [{node.domain}]" if node.domain else ""
|
| 31 |
+
print(f" [{i}] {node.op_type}{domain_str}: {list(node.input)} → {list(node.output)}")
|
| 32 |
+
for attr in node.attribute:
|
| 33 |
+
if attr.type == 2:
|
| 34 |
+
print(f" {attr.name} = {attr.i}")
|
| 35 |
+
elif attr.type == 1:
|
| 36 |
+
print(f" {attr.name} = {attr.f}")
|
| 37 |
+
elif attr.type == 7:
|
| 38 |
+
print(f" {attr.name} = {list(attr.ints)}")
|
| 39 |
+
|
| 40 |
+
# Analyze feature/config blob
|
| 41 |
+
for init in model.graph.initializer:
|
| 42 |
+
if "config" in init.name.lower():
|
| 43 |
+
raw = init.string_data[0] if init.string_data else init.raw_data
|
| 44 |
+
blob = bytes(raw)
|
| 45 |
+
print(f"\n ── feature/config analysis ──")
|
| 46 |
+
print(f" Total bytes: {len(blob)}")
|
| 47 |
+
print(f" First 32 bytes hex: {blob[:32].hex()}")
|
| 48 |
+
|
| 49 |
+
# Hypothesis: header + weight_matrix(input_dim × output_dim) + bias(output_dim)
|
| 50 |
+
# If input=21, output=50: 21*50=1050 floats = 4200 bytes, bias=50 floats = 200 bytes
|
| 51 |
+
# Total weights = 4400 bytes, header = 4492-4400 = 92 bytes
|
| 52 |
+
|
| 53 |
+
# Try reading first few uint32 as header
|
| 54 |
+
header_u32 = [int.from_bytes(blob[i:i+4], 'little') for i in range(0, min(96, len(blob)), 4)]
|
| 55 |
+
print(f" First 24 uint32 LE values: {header_u32}")
|
| 56 |
+
|
| 57 |
+
# Try float32 interpretation after various offsets
|
| 58 |
+
for offset in [0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92]:
|
| 59 |
+
remaining = len(blob) - offset
|
| 60 |
+
n_floats = remaining // 4
|
| 61 |
+
if n_floats == 0:
|
| 62 |
+
continue
|
| 63 |
+
arr = np.frombuffer(blob[offset:offset + n_floats*4], dtype=np.float32)
|
| 64 |
+
valid = np.isfinite(arr).sum()
|
| 65 |
+
reasonable = np.sum((np.abs(arr) < 10) & np.isfinite(arr))
|
| 66 |
+
if reasonable > n_floats * 0.7: # >70% reasonable values
|
| 67 |
+
print(f" *** offset={offset}: {n_floats} floats, {valid} finite, "
|
| 68 |
+
f"{reasonable} in [-10,10] ({100*reasonable/n_floats:.0f}%)")
|
| 69 |
+
print(f" First 10: {arr[:10]}")
|
| 70 |
+
print(f" Stats: mean={arr.mean():.4f}, std={arr.std():.4f}")
|
| 71 |
+
# Check if it could be weight matrix 21×50
|
| 72 |
+
if n_floats >= 1050 + 50:
|
| 73 |
+
W = arr[:1050].reshape(21, 50)
|
| 74 |
+
b = arr[1050:1100]
|
| 75 |
+
print(f" As 21×50 weight: W_range=[{W.min():.4f},{W.max():.4f}], "
|
| 76 |
+
f"b_range=[{b.min():.4f},{b.max():.4f}]")
|
| 77 |
+
# Test with random input
|
| 78 |
+
x = np.random.randn(1, 21).astype(np.float32)
|
| 79 |
+
y = x @ W + b
|
| 80 |
+
print(f" Test: input(21) → output(50), y_range=[{y.min():.4f},{y.max():.4f}]")
|
_archive/inspect_custom_ops.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inspect custom ops in models 11, 12, 22, 33 to determine exact op names and domains."""
|
| 2 |
+
import onnx
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 6 |
+
|
| 7 |
+
for idx in [11, 12, 22, 33]:
|
| 8 |
+
matches = list(models_dir.glob(f"model_{idx:02d}_*"))
|
| 9 |
+
if not matches:
|
| 10 |
+
print(f"model_{idx:02d}: NOT FOUND")
|
| 11 |
+
continue
|
| 12 |
+
|
| 13 |
+
model = onnx.load(str(matches[0]))
|
| 14 |
+
print(f"\n{'='*60}")
|
| 15 |
+
print(f"model_{idx:02d}: {matches[0].name}")
|
| 16 |
+
print(f" IR version: {model.ir_version}")
|
| 17 |
+
print(f" Opset imports: {[(o.domain, o.version) for o in model.opset_import]}")
|
| 18 |
+
|
| 19 |
+
# Find all non-standard ops
|
| 20 |
+
for node in model.graph.node:
|
| 21 |
+
if node.domain and node.domain != "":
|
| 22 |
+
print(f" Node: op_type={node.op_type!r}, domain={node.domain!r}")
|
| 23 |
+
print(f" inputs: {list(node.input)}")
|
| 24 |
+
print(f" outputs: {list(node.output)}")
|
| 25 |
+
# Print attributes
|
| 26 |
+
for attr in node.attribute:
|
| 27 |
+
if attr.type == 2: # INT
|
| 28 |
+
print(f" attr {attr.name} = {attr.i}")
|
| 29 |
+
elif attr.type == 1: # FLOAT
|
| 30 |
+
print(f" attr {attr.name} = {attr.f}")
|
| 31 |
+
elif attr.type == 3: # STRING
|
| 32 |
+
print(f" attr {attr.name} = {attr.s.decode()!r}")
|
| 33 |
+
elif attr.type == 4: # TENSOR
|
| 34 |
+
t = attr.t
|
| 35 |
+
print(f" attr {attr.name} = tensor(dtype={t.data_type}, dims={list(t.dims)}, raw_bytes={len(t.raw_data)})")
|
| 36 |
+
|
| 37 |
+
# Also show graph inputs/outputs
|
| 38 |
+
print(f" Graph inputs: {[(i.name, [d.dim_value or d.dim_param for d in i.type.tensor_type.shape.dim]) for i in model.graph.input]}")
|
| 39 |
+
print(f" Graph outputs: {[(o.name, [d.dim_value or d.dim_param for d in o.type.tensor_type.shape.dim]) for o in model.graph.output]}")
|
_archive/inspect_graph_deep.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deep-dive into model_11 and model_22 graph structure to understand OneOCRFeatureExtract."""
|
| 2 |
+
import onnx
|
| 3 |
+
import numpy as np
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
models_dir = Path("oneocr_extracted/onnx_models")
|
| 7 |
+
|
| 8 |
+
for idx in [11, 22]:
|
| 9 |
+
matches = list(models_dir.glob(f"model_{idx:02d}_*"))
|
| 10 |
+
model = onnx.load(str(matches[0]))
|
| 11 |
+
|
| 12 |
+
print(f"\n{'='*70}")
|
| 13 |
+
print(f"FULL GRAPH: model_{idx:02d}")
|
| 14 |
+
print(f"{'='*70}")
|
| 15 |
+
|
| 16 |
+
# All initializers (weights)
|
| 17 |
+
print(f"\n Initializers ({len(model.graph.initializer)}):")
|
| 18 |
+
for init in model.graph.initializer:
|
| 19 |
+
data = onnx.numpy_helper.to_array(init)
|
| 20 |
+
print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, "
|
| 21 |
+
f"range=[{data.min():.4f}, {data.max():.4f}]")
|
| 22 |
+
|
| 23 |
+
# All nodes
|
| 24 |
+
print(f"\n Nodes ({len(model.graph.node)}):")
|
| 25 |
+
for i, node in enumerate(model.graph.node):
|
| 26 |
+
domain_str = f" (domain={node.domain!r})" if node.domain else ""
|
| 27 |
+
print(f" [{i}] {node.op_type}{domain_str}")
|
| 28 |
+
print(f" in: {list(node.input)}")
|
| 29 |
+
print(f" out: {list(node.output)}")
|
| 30 |
+
for attr in node.attribute:
|
| 31 |
+
if attr.type == 2: # INT
|
| 32 |
+
print(f" {attr.name} = {attr.i}")
|
| 33 |
+
elif attr.type == 1: # FLOAT
|
| 34 |
+
print(f" {attr.name} = {attr.f}")
|
| 35 |
+
elif attr.type == 3: # STRING
|
| 36 |
+
val = attr.s
|
| 37 |
+
if len(val) > 100:
|
| 38 |
+
print(f" {attr.name} = bytes({len(val)})")
|
| 39 |
+
else:
|
| 40 |
+
print(f" {attr.name} = {val!r}")
|
| 41 |
+
elif attr.type == 4: # TENSOR
|
| 42 |
+
t = attr.t
|
| 43 |
+
print(f" {attr.name} = tensor(dtype={t.data_type}, dims={list(t.dims)}, "
|
| 44 |
+
f"raw_bytes={len(t.raw_data)})")
|
| 45 |
+
elif attr.type == 7: # INTS
|
| 46 |
+
print(f" {attr.name} = {list(attr.ints)}")
|
| 47 |
+
elif attr.type == 6: # FLOATS
|
| 48 |
+
print(f" {attr.name} = {list(attr.floats)[:10]}...")
|
| 49 |
+
|
| 50 |
+
# Show feature/config initializer details
|
| 51 |
+
for init in model.graph.initializer:
|
| 52 |
+
if "config" in init.name.lower() or "feature" in init.name.lower():
|
| 53 |
+
raw = init.raw_data
|
| 54 |
+
print(f"\n feature/config blob: {len(raw)} bytes")
|
| 55 |
+
print(f" First 64 bytes (hex): {raw[:64].hex()}")
|
| 56 |
+
print(f" Last 32 bytes (hex): {raw[-32:].hex()}")
|
| 57 |
+
# Try to interpret structure
|
| 58 |
+
# Check if starts with dimension info
|
| 59 |
+
print(f" As uint32 first 8 values: {[int.from_bytes(raw[i:i+4], 'little') for i in range(0, 32, 4)]}")
|
| 60 |
+
print(f" As float32 first 8 values: {list(np.frombuffer(raw[:32], dtype=np.float32))}")
|