"""Understand what the 21-dim input features are for LM models 11-32.

These models take data[1,21,1,1] → softmax[1,2] (binary classifier). We need
to figure out what 21 features to compute from the recognizer output.
"""
import onnx
from onnx import numpy_helper
import numpy as np
from pathlib import Path
import onnxruntime as ort

# The 21 input features likely come from CTC recognizer statistics.
# Test the unlocked models with some hypothetical feature vectors.
LOCKED_DIR = Path("oneocr_extracted/onnx_models")
UNLOCKED_DIR = Path("oneocr_extracted/onnx_models_unlocked")

# Hypothetical 21-dim feature vectors to feed the LM models, keyed by a label.
# "typical" mirrors the implied means from the model's own normalization layer.
_TRIAL_FEATURES = [
    ("all_zeros", np.zeros(21)),
    ("high_conf", np.array([0.0, 0.5, 0.9, 0.9, 0.9, 0.5, 0.5, 0.5, 0.5, 0.5,
                            0.5, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
                            0.01, 0.01, 1.0])),
    ("low_conf", np.array([3.0, -0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
                           0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
                           0.1])),
    ("typical", np.array([1.2, -0.4, 0.1, 0.15, 0.08, 0.35, 0.47, 0.43, 0.35,
                          0.58, 0.31, 0.04, 0.05, 0.03, 0.03, 0.02, 0.04,
                          0.04, 0.03, 0.03, 0.7])),
]


def _first_model(directory: Path, pattern: str) -> Path:
    """Return the first file in *directory* matching *pattern*, deterministically.

    ``Path.glob`` yields files in unspecified order, and ``list(...)[0]`` on an
    empty match raises an opaque IndexError — sort for determinism and raise a
    message that names what is missing.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"no file matching {pattern!r} in {directory}")
    return matches[0]


def _describe_lm_sessions(sess_sm: ort.InferenceSession,
                          sess_md: ort.InferenceSession) -> None:
    """Print input/output signatures of the LangSm (model_11) and LangMd (model_22) sessions."""
    print("LangSm (model_11) inputs:", [(i.name, i.shape, i.type) for i in sess_sm.get_inputs()])
    print("LangSm (model_11) outputs:", [(o.name, o.shape, o.type) for o in sess_sm.get_outputs()])
    print()
    print("LangMd (model_22) inputs:", [(i.name, i.shape, i.type) for i in sess_md.get_inputs()])
    print("LangMd (model_22) outputs:", [(o.name, o.shape, o.type) for o in sess_md.get_outputs()])


# The normalization constants inside the model tell us about expected feature
# ranges. From earlier analysis:
#   Add constant: [-1.273, 0.396, 0.134, 0.151, 0.084, 0.346, 0.472, 0.435,
#                  0.346, 0.581, 0.312, 0.036, 0.045, 0.033, 0.026, 0.022,
#                  0.044, 0.038, 0.029, 0.031, 0.696]
#   Div constant: [0.641, 0.914, 0.377, 0.399, 0.302, 0.657, 0.814, 0.769,
#                  0.658, 0.878, 0.617, 0.153, 0.166, 0.137, 0.120, 0.108,
#                  0.132, 0.115, 0.105, 0.108, 0.385]
#
# This means typical feature ranges are:
#   feature[0]:  mean = 1.273, std = 0.641 (large negative offset → feature is
#                centered around 1.27)
#   feature[20]: mean = -0.696, std = 0.385
#
# Feature 0:     large range  → possibly average log-probability or entropy
# Features 1-10: medium range → possibly per-class probabilities or scores
# Features 11-20: small range → possibly confidence statistics
def _dump_normalization_constants() -> None:
    """Extract and print the Add (=-mean) / Div (=std) constants from model_11.

    Loads the LOCKED copy of model_11 and walks its Constant nodes; outputs
    '26' and '28' are the Add and Div constants of the input-normalization
    layer (names presumed stable — confirmed by the earlier analysis above).
    """
    model_11 = onnx.load(str(_first_model(LOCKED_DIR, "model_11_*")))
    for node in model_11.graph.node:
        if node.op_type != "Constant":
            continue
        name = node.output[0]
        if name not in ('26', '28'):  # Add and Div constants
            continue
        for attr in node.attribute:
            # Named enum instead of the magic number 4 (AttributeProto.TENSOR).
            if attr.type == onnx.AttributeProto.TENSOR:
                data = numpy_helper.to_array(attr.t)
                label = "Add (=-mean)" if name == '26' else "Div (=std)"
                print(f"\n{label}: {data.flatten()}")
                # The mean tells us the expected center of each feature.
                if name == '26':  # mean = -add_const
                    means = -data.flatten()
                    print(f" Implied means: {means}")


# Hypothesis: the 21 features are CTC decoder statistics. Based on the
# normalization centers (means):
#   feat[0]:    ~1.27  — could be average negative log-likelihood (NLL) per character
#   feat[1]:    ~-0.40 — could be a score
#   feat[2-10]: ~0-0.5 — could be per-script probabilities from ScriptID
#   feat[11-20]: ~0-0.04 — could be character-level statistics
def _probe_recognizer() -> None:
    """Run the recognizer (model_02) on random input and print confidence stats.

    The statistics printed here (mean best prob, mean max log-prob, entropy)
    are candidate ingredients for the 21-dim LM feature vector.
    """
    rec_path = _first_model(LOCKED_DIR, "model_02_*")
    rec_sess = ort.InferenceSession(str(rec_path))
    print(f"\nRecognizer (model_02) outputs:")
    for o in rec_sess.get_outputs():
        print(f" {o.name}: {o.shape}")

    # Random probe image; seq_lengths = width / 4 (200/4 = 50 frames).
    test_data = np.random.randn(1, 3, 60, 200).astype(np.float32) * 0.1
    seq_lengths = np.array([50], dtype=np.int32)  # 200/4
    result = rec_sess.run(None, {"data": test_data, "seq_lengths": seq_lengths})
    logprobs = result[0]
    print(f"\nRecognizer output: {logprobs.shape}")
    print(f" Log-prob range: [{logprobs.min():.4f}, {logprobs.max():.4f}]")

    # Compute possible features from the recognizer output.
    # Assumes output layout is [T, batch, num_classes] — TODO confirm.
    lp = logprobs[:, 0, :]  # [T, num_classes]
    best_probs = np.exp(lp.max(axis=-1))  # best probability per frame
    mean_best = best_probs.mean()
    print(f"\n Mean best prob per frame: {mean_best:.4f}")
    print(f" Mean log-prob max: {lp.max(axis=-1).mean():.4f}")
    print(f" Entropy per frame: {(-np.exp(lp) * lp).sum(axis=-1).mean():.4f}")


# The 21 features might be computed as:
#   feat[0]      = average log-probability (NLL) → how confident the model is
#   feat[1..K]   = character frequency statistics
#   feat[K+1..20] = transition statistics
#
# Without the exact feature computation code from the DLL, we'll need to
# reverse-engineer or approximate the feature vector.
def _test_lm_models(sess_sm: ort.InferenceSession,
                    sess_md: ort.InferenceSession) -> None:
    """Feed each trial feature vector to both LM models and print the softmax outputs."""
    print(f"\n--- Testing LM models with various inputs ---")
    for name, features in _TRIAL_FEATURES:
        data = features.astype(np.float32).reshape(1, 21, 1, 1)
        sm_out = sess_sm.run(None, {"data": data})[0]
        md_out = sess_md.run(None, {"data": data})[0]
        print(f" {name:12s}: LangSm={sm_out.flatten()}, LangMd={md_out.flatten()}")


def main() -> None:
    """Run the full analysis: signatures, norm constants, recognizer probe, LM tests."""
    # Load a LangSm model (model_11 = Latin LangSm) and a LangMd model
    # (model_22 = Latin LangMd) from the unlocked copies.
    sess_sm = ort.InferenceSession(str(_first_model(UNLOCKED_DIR, "model_11_*")))
    sess_md = ort.InferenceSession(str(_first_model(UNLOCKED_DIR, "model_22_*")))

    _describe_lm_sessions(sess_sm, sess_md)
    _dump_normalization_constants()
    _probe_recognizer()
    _test_lm_models(sess_sm, sess_md)


if __name__ == "__main__":
    main()