"""Validate the cracked OneOCRFeatureExtract structure by replacing the op with standard Gemm."""
import onnx
from onnx import numpy_helper, TensorProto
import numpy as np
from pathlib import Path
import onnxruntime as ort

models_dir = Path("oneocr_extracted/onnx_models")

# Load model_11. The matches are sorted so the chosen file is deterministic
# when several share the prefix, and an empty match fails with a clear error
# instead of a bare IndexError.
model_candidates = sorted(models_dir.glob("model_11_*"))
if not model_candidates:
    raise FileNotFoundError(f"No model_11_* file found in {models_dir}")
model_path = model_candidates[0]
model = onnx.load(str(model_path))

# Get config blob: the first string_data entry of the "feature/config"
# initializer, interpreted below as big-endian float32.
blob = None
for init in model.graph.initializer:
    if init.name == "feature/config" and init.string_data:
        blob = bytes(init.string_data[0])
        break
if blob is None:
    raise ValueError(
        f"Initializer 'feature/config' with string data not found in {model_path}"
    )

# frombuffer gives a read-only big-endian view of the bytes; astype returns a
# writable copy that is actually converted to native-endian float32.
be_arr = np.frombuffer(blob, dtype='>f4').astype(np.float32)
print(f"Config blob: {len(be_arr)} floats total")
print(f"First 30 values: {be_arr[:30]}")

# Let's analyze the structure systematically
# The OneOCRFeatureExtract takes 21-dim input and produces 50-dim output
# So we expect a 21→50 transformation
IN_DIM = 21
OUT_DIM = 50
N_WEIGHTS = IN_DIM * OUT_DIM  # 21*50 = 1050
HEADER_LEN = 23  # values left over once W (1050) and b (50) are accounted for

# All three layouts below need HEADER_LEN + W + b values; check once up front
# so a short blob fails with a readable message rather than a reshape error.
_expected_len = HEADER_LEN + N_WEIGHTS + OUT_DIM
if len(be_arr) < _expected_len:
    raise ValueError(
        f"Config blob has {len(be_arr)} floats, expected at least {_expected_len}"
    )

# Method 1: Try W[21×50] + b[50] starting from the beginning (no header)
# 21*50 = 1050, 1050+50 = 1100, remaining = 23
W_0 = be_arr[:N_WEIGHTS].reshape(IN_DIM, OUT_DIM)
b_0 = be_arr[N_WEIGHTS:N_WEIGHTS + OUT_DIM]
tail = be_arr[N_WEIGHTS + OUT_DIM:]
print(f"\n--- No header: W[21×50] + b[50] + tail[{len(tail)}] ---")
print(f"  W: range=[{W_0.min():.4f}, {W_0.max():.4f}], mean={W_0.mean():.4f}, std={W_0.std():.4f}")
print(f"  b: range=[{b_0.min():.4f}, {b_0.max():.4f}], mean={b_0.mean():.4f}")
print(f"  tail: {tail}")

# Method 2: Try W[50×21] (transposed) + b[50]
W_t = be_arr[:N_WEIGHTS].reshape(OUT_DIM, IN_DIM)
b_t = be_arr[N_WEIGHTS:N_WEIGHTS + OUT_DIM]
print(f"\n--- No header: W[50×21] + b[50] + tail[{len(tail)}] ---")
print(f"  W: range=[{W_t.min():.4f}, {W_t.max():.4f}], mean={W_t.mean():.4f}, std={W_t.std():.4f}")
print(f"  b: range=[{b_t.min():.4f}, {b_t.max():.4f}], mean={b_t.mean():.4f}")

# Method 3: header=23 + W[21×50] + b[50]
# The bias slice is bounded explicitly (like methods 1 and 2) so that any
# values beyond the expected layout are not silently folded into the bias.
header = be_arr[:HEADER_LEN]
W_h = be_arr[HEADER_LEN:HEADER_LEN + N_WEIGHTS].reshape(IN_DIM, OUT_DIM)
b_h = be_arr[HEADER_LEN + N_WEIGHTS:HEADER_LEN + N_WEIGHTS + OUT_DIM]
print(f"\n--- Header=23: W[21×50] + b[50] ---")
print(f"  Header: {header}")
print(f"  W: range=[{W_h.min():.4f}, {W_h.max():.4f}], mean={W_h.mean():.4f}, std={W_h.std():.4f}")
print(f"  b: range=[{b_h.min():.4f}, {b_h.max():.4f}], mean={b_h.mean():.4f}")

# Check where the large values are
print(f"\n--- Values > 10 ---")
for i, v in enumerate(be_arr):
    if abs(v) > 10:
        print(f"  [{i}] = {v}")

# Check if tail/header might be something meaningful
# 23 values: could be normalization params (21 dim + 2 extras?)
# Or dimensions metadata

# Now emulate a replacement for the custom op in NumPy.
# The original graph:
# data[1,21,1,1] → Reshape → Slice[0:21] → Add(offset) → Div(scale) → OneOCRFeatureExtract → [50]
# → Gemm(50,50) → Relu → Gemm(50,50) → Relu → Gemm(50,2) → Softmax

# OneOCRFeatureExtract is stood in for by a plain affine map (what a Gemm
# computes), tried with all 3 weight interpretations.

# Get normalization constants from the Constant nodes feeding Add and Div.
ADD_CONST_OUTPUT = '26'  # Add constant
DIV_CONST_OUTPUT = '28'  # Div constant

add_const = None
div_const = None
for node in model.graph.node:
    if node.op_type != "Constant":
        continue
    name = node.output[0]
    # Only the two normalization constants are needed; skip decoding the rest.
    if name not in (ADD_CONST_OUTPUT, DIV_CONST_OUTPUT):
        continue
    for attr in node.attribute:
        # Only tensor-valued attributes carry the constant's data.
        if attr.type != onnx.AttributeProto.TENSOR:
            continue
        data = numpy_helper.to_array(attr.t)
        if name == ADD_CONST_OUTPUT:
            add_const = data
        else:
            div_const = data

# Fail with a readable message instead of AttributeError on None.shape.
if add_const is None or div_const is None:
    raise ValueError(
        "Normalization constants not found in graph "
        f"(add '{ADD_CONST_OUTPUT}' found={add_const is not None}, "
        f"div '{DIV_CONST_OUTPUT}' found={div_const is not None})"
    )

print(f"\nNormalization: add={add_const.shape}, div={div_const.shape}")

# Test with sample input. A fixed seed keeps the printed numbers comparable
# between runs and between the three weight interpretations.
rng = np.random.default_rng(0)
test_input = rng.standard_normal((1, 21, 1, 1)).astype(np.float32)

# Simulate the preprocessing
x = test_input.reshape(1, 21)[:, :21]  # Slice
x = (x + add_const) / div_const  # Normalize

# Extract the MLP weights once; they do not depend on which interpretation of
# the feature-extract weights is being tested.
mlp_names = (
    "learned_2",
    "learned_3",
    "1.layers.5.weight",
    "1.layers.5.bias",
    "1.layers.7.weight",
    "1.layers.7.bias",
)
initializers = {init.name: init for init in model.graph.initializer}
missing = [n for n in mlp_names if n not in initializers]
if missing:
    raise KeyError(f"MLP initializers missing from model: {missing}")
W2, b2, W5, b5, W7, b7 = (
    numpy_helper.to_array(initializers[n]) for n in mlp_names
)

# Apply feature extraction for each method
for name, W, b in [
    ("no_header_21x50", W_0, b_0),
    ("no_header_50x21_T", W_t.T, b_t),
    ("header23_21x50", W_h, b_h),
]:
    feat = x @ W + b  # [1, 50]
    print(f"\n{name}:")
    print(f"  Feature output: range=[{feat.min():.4f}, {feat.max():.4f}], mean={feat.mean():.4f}")

    # Continue through the MLP.
    # NOTE(review): the transposes assume weights are stored as [out, in]
    # (Gemm with transB=1) — confirm against the node attributes.
    h1 = np.maximum(0, feat @ W2.T + b2)  # Gemm + Relu
    h2 = np.maximum(0, h1 @ W5.T + b5)    # Gemm + Relu
    logits = h2 @ W7.T + b7                # Gemm

    # Softmax, shifted by the row max so np.exp cannot overflow on large
    # logits; the shift cancels out in the ratio.
    exp_shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
    print(f"  Final softmax: {probs}")