File size: 3,073 Bytes
ce847d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
"""Decode OneOCRFeatureExtract config blob."""
import onnx
import numpy as np
import struct
from pathlib import Path
MODEL_PATH = 'oneocr_extracted/onnx_models/model_11_ir6_1.9_26KB.onnx'
RNN_INFO_PATH = 'oneocr_extracted/config_data/chunk_36_rnn_info.rnn_info'
CONFIG_INITIALIZER = 'feature/config'


def load_config_blob(model_path=MODEL_PATH, name=CONFIG_INITIALIZER):
    """Return the first string_data entry of the initializer called *name*.

    Loads the ONNX model at *model_path* and scans its graph initializers.
    Returns ``None`` when no initializer carries that name.
    """
    model = onnx.load(model_path)
    for init in model.graph.initializer:
        if init.name == name:
            return init.string_data[0]
    return None


def float32_view(raw):
    """Interpret *raw* bytes as a little-endian float32 array.

    Trailing bytes that do not fill a whole 4-byte element are ignored, so a
    blob whose length is not a multiple of 4 no longer makes
    ``np.frombuffer`` raise ``ValueError``.
    """
    usable = len(raw) - len(raw) % 4
    if usable == 0:
        return np.empty(0, dtype=np.float32)
    # '<f4' matches the explicit little-endian '<' struct formats used in main().
    return np.frombuffer(raw[:usable], dtype='<f4')


def find_value_runs(mask):
    """Locate runs of consecutive True values in a 1-D boolean *mask*.

    Returns ``(starts, ends)`` index arrays of equal length; run ``i`` covers
    ``mask[starts[i]:ends[i]]`` (end exclusive).
    """
    flags = np.asarray(mask, dtype=bool).astype(np.int8)
    # Pad with a zero on both sides so a run touching the first or last
    # element still yields a matching start/end edge. Without the padding the
    # starts and ends arrays can differ in length or be offset by one run.
    edges = np.diff(np.concatenate(([0], flags, [0])))
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)
    return starts, ends


def parse_rnn_sections(rnn_lines):
    """Collect ``{section_tag: count}`` from the lines of an rnn_info file.

    A line that is exactly ``<Tag>`` opens a section; its count is recorded as
    0 once a following non-header line is seen. A line of the form
    ``<Tag> ... N`` opens a section whose count is the trailing integer
    (``int()`` raises ``ValueError`` if that token is not an integer); each
    such header is also printed.
    """
    section = None
    counts = {}
    for line in rnn_lines:
        if line.startswith('<') and line.endswith('>'):
            section = line
        elif line.startswith('<') and '>' in line:
            parts = line.strip().split()
            section = parts[0].rstrip('>') + '>'
            count = int(parts[-1]) if len(parts) > 1 else 0
            counts[section] = count
            print(f'Section: {section} count={count}')
        elif section and section not in counts:
            counts[section] = 0
    return counts


def main():
    """Print exploratory views of the config blob and the rnn_info file."""
    raw = load_config_blob()
    if raw is None:
        print(f'Initializer {CONFIG_INITIALIZER!r} not found in {MODEL_PATH}')
        return

    print(f'Total bytes: {len(raw)}')
    print(f'First 100 bytes hex: {raw[:100].hex()}')

    # Try different structure interpretations
    for offset in (0, 4, 8, 12):
        vals = struct.unpack_from('<4f', raw, offset)
        print(f'Offset {offset:3d} as 4xfloat32: {vals}')

    # Parse rnn_info to find LogPrior values: the last token of the first
    # line is read as the count, the second line as the first value.
    rnn_lines = Path(RNN_INFO_PATH).read_text().strip().split('\n')
    lp_count = int(rnn_lines[0].split()[-1])
    print(f'\nLogPrior count from rnn_info: {lp_count}')

    lp_val = float(rnn_lines[1])
    print(f'LogPrior[0] = {lp_val}')

    # Search the blob for that value encoded as float32 and as float64.
    lp_f32 = struct.pack('<f', np.float32(lp_val))
    lp_f64 = struct.pack('<d', lp_val)
    print(f'LogPrior as float32 at pos: {raw.find(lp_f32)}')
    print(f'LogPrior as float64 at pos: {raw.find(lp_f64)}')

    # Find sections of "reasonable" float values: non-zero, magnitude < 20.
    arr_f32 = float32_view(raw)
    reasonable = (np.abs(arr_f32) < 20) & (arr_f32 != 0)
    starts, ends = find_value_runs(reasonable)
    print('\nSections of reasonable float32 values:')
    for s, e in zip(starts[:10], ends[:10]):
        print(f' [{s}:{e}] ({e-s} values) first: {arr_f32[s:s+3]}')

    # Check if first few bytes are a header
    header_ints = struct.unpack_from('<8I', raw, 0)
    print(f'\nFirst 8 uint32: {header_ints}')
    header_shorts = struct.unpack_from('<16H', raw, 0)
    print(f'First 16 uint16: {header_shorts}')

    # Maybe it's a rnn_info-like structure embedded
    # The rnn_info has sections: <LogPrior>, <TransMat>, <LmSmall>/<LmMedium>
    # Let's check the rnn_info structure fully
    print('\n=== rnn_info structure ===')
    counts = parse_rnn_sections(rnn_lines)
    print(f'Sections found: {counts}')


if __name__ == '__main__':
    main()
|