# oneocr / _archive / analyze_lm_features.py
# OneOCR Dev
# OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
# ce847d4
"""Understand what the 21-dim input features are for LM models 11-32.
These models take data[1,21,1,1] → softmax[1,2] (binary classifier).
We need to figure out what 21 features to compute from the recognizer output."""
import onnx
from onnx import numpy_helper
import numpy as np
from pathlib import Path
import onnxruntime as ort
# The 21 input features likely come from CTC recognizer statistics.
# Let's test with the unlocked models using some hypothetical feature vectors.
models_dir = Path("oneocr_extracted/onnx_models_unlocked")


def _open_session(pattern):
    """Create an ORT session for the first unlocked model matching *pattern*."""
    return ort.InferenceSession(str(list(models_dir.glob(pattern))[0]))


# model_11 = Latin LangSm, model_22 = Latin LangMd
sess_sm = _open_session("model_11_*")
sess_md = _open_session("model_22_*")

# Dump each session's input/output specs (name, shape, element type).
for pos, (tag, sess) in enumerate((("LangSm (model_11)", sess_sm), ("LangMd (model_22)", sess_md))):
    if pos:
        print()
    print(f"{tag} inputs:", [(i.name, i.shape, i.type) for i in sess.get_inputs()])
    print(f"{tag} outputs:", [(o.name, o.shape, o.type) for o in sess.get_outputs()])
# The normalization constants inside the model tell us about expected feature ranges
# From earlier analysis:
# Add constant: [-1.273, 0.396, 0.134, 0.151, 0.084, 0.346, 0.472, 0.435,
# 0.346, 0.581, 0.312, 0.036, 0.045, 0.033, 0.026, 0.022,
# 0.044, 0.038, 0.029, 0.031, 0.696]
# Div constant: [0.641, 0.914, 0.377, 0.399, 0.302, 0.657, 0.814, 0.769,
# 0.658, 0.878, 0.617, 0.153, 0.166, 0.137, 0.120, 0.108,
# 0.132, 0.115, 0.105, 0.108, 0.385]
#
# This means typical feature ranges are:
# feature[0]: mean = 1.273, std = 0.641 (large negative offset → feature is centered around 1.27)
# feature[20]: mean = -0.696, std = 0.385
#
# Features 0: Large range → possibly average log-probability or entropy
# Features 1-10: Medium range → possibly per-class probabilities or scores
# Features 11-20: Small range → possibly confidence statistics
# Let's check: extract normalization params from model_11.
# The model normalizes its 21-dim input as (x + add_const) / div_const, so the
# Add constant is the negated per-feature mean and the Div constant the std.
model_11 = onnx.load(str(list(Path("oneocr_extracted/onnx_models").glob("model_11_*"))[0]))
for node in model_11.graph.node:
    # Guard clauses instead of deep nesting: skip everything that is not one of
    # the two Constant nodes feeding the Add/Div normalization ops.
    if node.op_type != "Constant":
        continue
    name = node.output[0]
    if name not in ('26', '28'):  # graph output names of the Add and Div constants
        continue
    for attr in node.attribute:
        # Use the symbolic enum instead of the raw wire value 4
        # (onnx.AttributeProto.TENSOR == 4 per the ONNX protobuf schema).
        if attr.type == onnx.AttributeProto.TENSOR:
            data = numpy_helper.to_array(attr.t)
            label = "Add (=-mean)" if name == '26' else "Div (=std)"
            print(f"\n{label}: {data.flatten()}")
            # The mean tells us the expected center of each feature
            if name == '26':
                # mean = -add_const, since normalization adds the constant.
                means = -data.flatten()
                print(f" Implied means: {means}")
# Hypothesis: The 21 features are CTC decoder statistics:
# Based on the normalization centers (means):
# feat[0]: ~1.27 — could be average negative log-likelihood (NLL) per character
# feat[1]: ~-0.40 — could be a score
# feat[2-10]: ~0-0.5 — could be per-script probabilities from ScriptID
# feat[11-20]: ~0-0.04 — could be character-level statistics
# Let's test what outputs the recognizer produces
rec_path = list(Path("oneocr_extracted/onnx_models").glob("model_02_*"))[0]
rec_sess = ort.InferenceSession(str(rec_path))
print("\nRecognizer (model_02) outputs:")
for o in rec_sess.get_outputs():
    print(f" {o.name}: {o.shape}")

# Try running recognizer and computing statistics
test_data = np.random.randn(1, 3, 60, 200).astype(np.float32) * 0.1
seq_lengths = np.array([50], dtype=np.int32)  # 200/4
logprobs = rec_sess.run(None, {"data": test_data, "seq_lengths": seq_lengths})[0]
print(f"\nRecognizer output: {logprobs.shape}")
print(f" Log-prob range: [{logprobs.min():.4f}, {logprobs.max():.4f}]")

# Compute possible features from recognizer output:
lp = logprobs[:, 0, :]  # [T, num_classes]
frame_max_lp = lp.max(axis=-1)  # best log-prob per frame, hoisted (used twice)
print(f"\n Mean best prob per frame: {np.exp(frame_max_lp).mean():.4f}")
print(f" Mean log-prob max: {frame_max_lp.mean():.4f}")
# Shannon entropy per frame: -sum(p * log p) with p = exp(lp).
print(f" Entropy per frame: {(-np.exp(lp) * lp).sum(axis=-1).mean():.4f}")
# The 21 features might be computed as:
# feat[0] = average log-probability (NLL) → how confident the model is
# feat[1..K] = character frequency statistics
# feat[K+1..20] = transition statistics
#
# Without the exact feature computation code from the DLL, we'll need to
# reverse-engineer or approximate the feature vector.
# For now, test the LM models with various feature values
print("\n--- Testing LM models with various inputs ---")
# Hypothetical 21-dim feature vectors; "typical" mirrors the implied means
# recovered from the model's normalization constants.
probe_vectors = {
    "all_zeros": np.zeros(21),
    "high_conf": np.array([0.0, 0.5, 0.9, 0.9, 0.9, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 1.0]),
    "low_conf": np.array([3.0, -0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.1]),
    "typical": np.array([1.2, -0.4, 0.1, 0.15, 0.08, 0.35, 0.47, 0.43, 0.35, 0.58, 0.31, 0.04, 0.05, 0.03, 0.03, 0.02, 0.04, 0.04, 0.03, 0.03, 0.7]),
}
for name, features in probe_vectors.items():
    feats = features.astype(np.float32).reshape(1, 21, 1, 1)
    sm_out = sess_sm.run(None, {"data": feats})[0]
    md_out = sess_md.run(None, {"data": feats})[0]
    print(f" {name:12s}: LangSm={sm_out.flatten()}, LangMd={md_out.flatten()}")