File size: 5,545 Bytes
ce847d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""Understand what the 21-dim input features are for LM models 11-32.
These models take data[1,21,1,1] → softmax[1,2] (binary classifier).
We need to figure out what 21 features to compute from the recognizer output."""
import onnx
from onnx import numpy_helper
import numpy as np
from pathlib import Path
import onnxruntime as ort

# The 21 input features likely come from CTC recognizer statistics.
# Let's test with the unlocked models using some hypothetical feature vectors.

models_dir = Path("oneocr_extracted/onnx_models_unlocked")

# Load one representative model from each LM family:
# model_11 = Latin LangSm, model_22 = Latin LangMd.
sess_sm = ort.InferenceSession(str(next(models_dir.glob("model_11_*"))))
sess_md = ort.InferenceSession(str(next(models_dir.glob("model_22_*"))))

# Dump the declared I/O signatures of both sessions (blank line between them).
for idx, (label, sess) in enumerate([("LangSm (model_11)", sess_sm),
                                     ("LangMd (model_22)", sess_md)]):
    if idx:
        print()
    print(f"{label} inputs:", [(i.name, i.shape, i.type) for i in sess.get_inputs()])
    print(f"{label} outputs:", [(o.name, o.shape, o.type) for o in sess.get_outputs()])

# The normalization constants inside the model tell us about expected feature ranges
# From earlier analysis:
# Add constant: [-1.273, 0.396, 0.134, 0.151, 0.084, 0.346, 0.472, 0.435, 
#                 0.346, 0.581, 0.312, 0.036, 0.045, 0.033, 0.026, 0.022,
#                 0.044, 0.038, 0.029, 0.031, 0.696]
# Div constant: [0.641, 0.914, 0.377, 0.399, 0.302, 0.657, 0.814, 0.769,
#                 0.658, 0.878, 0.617, 0.153, 0.166, 0.137, 0.120, 0.108,
#                 0.132, 0.115, 0.105, 0.108, 0.385]
# 
# This means typical feature ranges are:
# feature[0]: mean = 1.273, std = 0.641 (large negative offset → feature is centered around 1.27)
# feature[20]: mean = -0.696, std = 0.385 
#
# Features 0: Large range → possibly average log-probability or entropy
# Features 1-10: Medium range → possibly per-class probabilities or scores
# Features 11-20: Small range → possibly confidence statistics

# Let's check: extract normalization params from model_11.
# The graph normalizes its 21-dim input as (x + add_const) / div_const,
# i.e. add_const holds the negated per-feature mean and div_const the std.
model_11 = onnx.load(str(next(Path("oneocr_extracted/onnx_models").glob("model_11_*"))))

# Constant-node output names of interest, mapped to a printable label.
# NOTE(review): the names '26'/'28' come from earlier graph inspection of
# model_11 specifically — other models may number these nodes differently.
norm_labels = {'26': "Add (=-mean)", '28': "Div (=std)"}

for node in model_11.graph.node:
    if node.op_type != "Constant":
        continue
    name = node.output[0]
    if name not in norm_labels:
        continue
    for attr in node.attribute:
        # Use the symbolic enum rather than the magic number 4
        # (AttributeProto.AttributeType.TENSOR == 4 in the ONNX schema).
        if attr.type == onnx.AttributeProto.TENSOR:
            data = numpy_helper.to_array(attr.t)
            print(f"\n{norm_labels[name]}: {data.flatten()}")
            # The mean tells us the expected center of each feature.
            if name == '26':
                # Add constant = -mean, so negate to recover the means.
                means = -data.flatten()
                print(f"  Implied means: {means}")

# Hypothesis: The 21 features are CTC decoder statistics:
# Based on the normalization centers (means):
# feat[0]:  ~1.27 — could be average negative log-likelihood (NLL) per character
# feat[1]:  ~-0.40 — could be a score
# feat[2-10]: ~0-0.5 — could be per-script probabilities from ScriptID
# feat[11-20]: ~0-0.04 — could be character-level statistics

# Let's test what outputs the recognizer produces
recognizer_path = next(Path("oneocr_extracted/onnx_models").glob("model_02_*"))
recognizer = ort.InferenceSession(str(recognizer_path))
print(f"\nRecognizer (model_02) outputs:")
for out in recognizer.get_outputs():
    print(f"  {out.name}: {out.shape}")

# Feed random noise through the recognizer and inspect its output statistics.
noise_input = (np.random.randn(1, 3, 60, 200) * 0.1).astype(np.float32)
# 200-px width downsampled by 4 → 50 frames.
# NOTE(review): assumes the model declares seq_lengths as int32 — confirm.
frame_counts = np.array([50], dtype=np.int32)
result = recognizer.run(None, {"data": noise_input, "seq_lengths": frame_counts})
logprobs = result[0]
print(f"\nRecognizer output: {logprobs.shape}")
print(f"  Log-prob range: [{logprobs.min():.4f}, {logprobs.max():.4f}]")

# Compute possible features from recognizer output:
lp = logprobs[:, 0, :]  # drop the batch axis → [T, num_classes]; TODO confirm layout
frame_max_lp = lp.max(axis=-1)      # best log-prob in each frame
best_probs = np.exp(frame_max_lp)   # best probability in each frame
print(f"\n  Mean best prob per frame: {best_probs.mean():.4f}")
print(f"  Mean log-prob max: {frame_max_lp.mean():.4f}")
# Shannon entropy per frame: -sum_c p_c * log p_c, with p = exp(lp).
frame_entropy = -(np.exp(lp) * lp).sum(axis=-1)
print(f"  Entropy per frame: {frame_entropy.mean():.4f}")

# The 21 features might be computed as:
# feat[0] = average log-probability (NLL) → how confident the model is
# feat[1..K] = character frequency statistics 
# feat[K+1..20] = transition statistics
#
# Without the exact feature computation code from the DLL, we'll need to
# reverse-engineer or approximate the feature vector.

# For now, test the LM models with various feature values
print(f"\n--- Testing LM models with various inputs ---")
probe_vectors = {
    "all_zeros": np.zeros(21),
    "high_conf": np.array([0.0, 0.5, 0.9, 0.9, 0.9, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 1.0]),
    "low_conf": np.array([3.0, -0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.1]),
    "typical": np.array([1.2, -0.4, 0.1, 0.15, 0.08, 0.35, 0.47, 0.43, 0.35, 0.58, 0.31, 0.04, 0.05, 0.03, 0.03, 0.02, 0.04, 0.04, 0.03, 0.03, 0.7]),
}
for case_name, vec in probe_vectors.items():
    # Shape the feature vector to the [1, 21, 1, 1] layout the LM models expect.
    batch = vec.astype(np.float32).reshape(1, 21, 1, 1)
    sm_result = sess_sm.run(None, {"data": batch})[0]
    md_result = sess_md.run(None, {"data": batch})[0]
    print(f"  {case_name:12s}: LangSm={sm_result.flatten()}, LangMd={md_result.flatten()}")