# oneocr / _archive / analyze_lm_features.py
# OneOCR Dev
# OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
# ce847d4
"""Understand what the 21-dim input features are for LM models 11-32.
These models take data[1,21,1,1] → softmax[1,2] (binary classifier).
We need to figure out what 21 features to compute from the recognizer output."""
import onnx
from onnx import numpy_helper
import numpy as np
from pathlib import Path
import onnxruntime as ort
# The 21 input features likely come from CTC recognizer statistics.
# Let's test with the unlocked models using some hypothetical feature vectors.
models_dir = Path("oneocr_extracted/onnx_models_unlocked")


def _open_session(pattern):
    """Create an ORT session for the first unlocked model matching *pattern*."""
    return ort.InferenceSession(str(list(models_dir.glob(pattern))[0]))


# model_11 = Latin LangSm, model_22 = Latin LangMd
sess_sm = _open_session("model_11_*")
sess_md = _open_session("model_22_*")

# Dump each session's input/output specs (name, shape, element type).
for pos, (tag, sess) in enumerate((("LangSm (model_11)", sess_sm), ("LangMd (model_22)", sess_md))):
    if pos:
        print()
    print(f"{tag} inputs:", [(i.name, i.shape, i.type) for i in sess.get_inputs()])
    print(f"{tag} outputs:", [(o.name, o.shape, o.type) for o in sess.get_outputs()])
# The normalization constants inside the model tell us about expected feature ranges
# From earlier analysis:
# Add constant: [-1.273, 0.396, 0.134, 0.151, 0.084, 0.346, 0.472, 0.435,
# 0.346, 0.581, 0.312, 0.036, 0.045, 0.033, 0.026, 0.022,
# 0.044, 0.038, 0.029, 0.031, 0.696]
# Div constant: [0.641, 0.914, 0.377, 0.399, 0.302, 0.657, 0.814, 0.769,
# 0.658, 0.878, 0.617, 0.153, 0.166, 0.137, 0.120, 0.108,
# 0.132, 0.115, 0.105, 0.108, 0.385]
#
# This means typical feature ranges are:
# feature[0]: mean = 1.273, std = 0.641 (large negative offset → feature is centered around 1.27)
# feature[20]: mean = -0.696, std = 0.385
#
# Features 0: Large range → possibly average log-probability or entropy
# Features 1-10: Medium range → possibly per-class probabilities or scores
# Features 11-20: Small range → possibly confidence statistics
# Let's check: extract normalization params from model_11.
# The model normalizes its 21-dim input as (x + add_const) / div_const, so the
# Add constant is the negated per-feature mean and the Div constant the std.
model_11 = onnx.load(str(list(Path("oneocr_extracted/onnx_models").glob("model_11_*"))[0]))
for node in model_11.graph.node:
    # Guard clauses instead of deep nesting: skip everything that is not one of
    # the two Constant nodes feeding the Add/Div normalization ops.
    if node.op_type != "Constant":
        continue
    name = node.output[0]
    if name not in ('26', '28'):  # graph output names of the Add and Div constants
        continue
    for attr in node.attribute:
        # Use the symbolic enum instead of the raw wire value 4
        # (onnx.AttributeProto.TENSOR == 4 per the ONNX protobuf schema).
        if attr.type == onnx.AttributeProto.TENSOR:
            data = numpy_helper.to_array(attr.t)
            label = "Add (=-mean)" if name == '26' else "Div (=std)"
            print(f"\n{label}: {data.flatten()}")
            # The mean tells us the expected center of each feature
            if name == '26':
                # mean = -add_const, since normalization adds the constant.
                means = -data.flatten()
                print(f" Implied means: {means}")
# Hypothesis: The 21 features are CTC decoder statistics:
# Based on the normalization centers (means):
# feat[0]: ~1.27 — could be average negative log-likelihood (NLL) per character
# feat[1]: ~-0.40 — could be a score
# feat[2-10]: ~0-0.5 — could be per-script probabilities from ScriptID
# feat[11-20]: ~0-0.04 — could be character-level statistics
# Let's test what outputs the recognizer produces
rec_path = list(Path("oneocr_extracted/onnx_models").glob("model_02_*"))[0]
rec_sess = ort.InferenceSession(str(rec_path))
print("\nRecognizer (model_02) outputs:")
for o in rec_sess.get_outputs():
    print(f" {o.name}: {o.shape}")

# Try running recognizer and computing statistics
test_data = np.random.randn(1, 3, 60, 200).astype(np.float32) * 0.1
seq_lengths = np.array([50], dtype=np.int32)  # 200/4
logprobs = rec_sess.run(None, {"data": test_data, "seq_lengths": seq_lengths})[0]
print(f"\nRecognizer output: {logprobs.shape}")
print(f" Log-prob range: [{logprobs.min():.4f}, {logprobs.max():.4f}]")

# Compute possible features from recognizer output:
lp = logprobs[:, 0, :]  # [T, num_classes]
frame_max_lp = lp.max(axis=-1)  # best log-prob per frame, hoisted (used twice)
print(f"\n Mean best prob per frame: {np.exp(frame_max_lp).mean():.4f}")
print(f" Mean log-prob max: {frame_max_lp.mean():.4f}")
# Shannon entropy per frame: -sum(p * log p) with p = exp(lp).
print(f" Entropy per frame: {(-np.exp(lp) * lp).sum(axis=-1).mean():.4f}")
# The 21 features might be computed as:
# feat[0] = average log-probability (NLL) → how confident the model is
# feat[1..K] = character frequency statistics
# feat[K+1..20] = transition statistics
#
# Without the exact feature computation code from the DLL, we'll need to
# reverse-engineer or approximate the feature vector.
# For now, test the LM models with various feature values
print("\n--- Testing LM models with various inputs ---")
# Hypothetical 21-dim feature vectors; "typical" mirrors the implied means
# recovered from the model's normalization constants.
probe_vectors = {
    "all_zeros": np.zeros(21),
    "high_conf": np.array([0.0, 0.5, 0.9, 0.9, 0.9, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 1.0]),
    "low_conf": np.array([3.0, -0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.1]),
    "typical": np.array([1.2, -0.4, 0.1, 0.15, 0.08, 0.35, 0.47, 0.43, 0.35, 0.58, 0.31, 0.04, 0.05, 0.03, 0.03, 0.02, 0.04, 0.04, 0.03, 0.03, 0.7]),
}
for name, features in probe_vectors.items():
    feats = features.astype(np.float32).reshape(1, 21, 1, 1)
    sm_out = sess_sm.run(None, {"data": feats})[0]
    md_out = sess_md.run(None, {"data": feats})[0]
    print(f" {name:12s}: LangSm={sm_out.flatten()}, LangMd={md_out.flatten()}")