Upload scripts/export_decoder.py with huggingface_hub
scripts/export_decoder.py
ADDED
@@ -0,0 +1,551 @@
#!/usr/bin/env python3
"""
Phase 3b: Text Decoder Export for ExecuTorch
Extracts language_model + lm_head into a standalone nn.Module
with static KV cache tensors for torch.export compatibility.

Architecture: Qwen3 decoder (28 layers, GQA 16/8 heads, head_dim=128)
Fixed max_seq_len: 4096 (see MAX_SEQ_LEN)
"""

import os

import torch
import torch.nn as nn
import torch.nn.functional as F

# Model constants from config
HIDDEN_SIZE = 1024
NUM_LAYERS = 28
NUM_HEADS = 16
NUM_KV_HEADS = 8
HEAD_DIM = 128
INTERMEDIATE_SIZE = 3072
VOCAB_SIZE = 151936
MAX_SEQ_LEN = 4096
RMS_EPS = 1e-6
ROPE_THETA = 1000000.0
NUM_KV_GROUPS = NUM_HEADS // NUM_KV_HEADS  # 2

MODEL_DIR = "./models/LightOnOCR-2-1B"

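# A note on derived sizes (arithmetic from the constants above): the attention
# width NUM_HEADS * HEAD_DIM = 16 * 128 = 2048 is wider than HIDDEN_SIZE
# (1024), so q_proj maps 1024 -> 2048 and o_proj maps 2048 -> 1024, while
# k_proj/v_proj map 1024 -> 8 * 128 = 1024.
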
def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = RMS_EPS) -> torch.Tensor:
    """Inline RMSNorm — avoids @use_kernel_forward_from_hub decorator."""
    input_dtype = x.dtype
    x = x.to(torch.float32)
    variance = x.pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return weight * x.to(input_dtype)


def precompute_rope_freqs(max_seq_len: int, head_dim: int, theta: float = ROPE_THETA):
    """Precompute RoPE cos/sin for all positions up to max_seq_len."""
    freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    t = torch.arange(max_seq_len, dtype=torch.float32)
    freqs = torch.outer(t, freqs)
    cos = freqs.cos()
    sin = freqs.sin()
    # Duplicate for full head_dim: [seq_len, head_dim/2] -> [seq_len, head_dim]
    cos = torch.cat([cos, cos], dim=-1)
    sin = torch.cat([sin, sin], dim=-1)
    return cos, sin  # [max_seq_len, head_dim]

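# Quick sanity check (arithmetic only): at position t=0 every angle is zero, so
# cos[0] should be all ones and sin[0] all zeros. With head_dim=128 and
# theta=1e6, the per-pair frequencies span 1.0 down to 1e6 ** (-126/128),
# roughly 1.2e-6 radians per token.
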
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """
    Apply rotary position embeddings to query and key states.
    q: [batch, num_heads, seq_len, head_dim]
    k: [batch, num_kv_heads, seq_len, head_dim]
    cos, sin: [max_seq_len, head_dim]
    position_ids: [batch, seq_len]
    """
    # Gather cos/sin for the given positions
    cos = cos[position_ids].unsqueeze(1)  # [batch, 1, seq_len, head_dim]
    sin = sin[position_ids].unsqueeze(1)  # [batch, 1, seq_len, head_dim]

    # Rotate
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

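# Why x*cos + rotate_half(x)*sin is a rotation: the half-split layout pairs
# dimension i with dimension i + head_dim/2, and both share angle theta_i.
# Rotating the pair (a, b) by theta_i yields (a*cos - b*sin, b*cos + a*sin),
# which is exactly cos * (a, b) + sin * (-b, a), and (-b, a) = rotate_half(x).
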
class Qwen3AttentionFixed(nn.Module):
    """
    Fixed Qwen3 attention with static KV cache, inline QK-norm, and
    no dynamic dispatch. Designed for torch.export compatibility.
    """

    def __init__(self, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.scaling = HEAD_DIM ** -0.5

        # Projections
        self.q_proj = nn.Linear(HIDDEN_SIZE, NUM_HEADS * HEAD_DIM, bias=False)
        self.k_proj = nn.Linear(HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False)
        self.v_proj = nn.Linear(HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False)
        self.o_proj = nn.Linear(NUM_HEADS * HEAD_DIM, HIDDEN_SIZE, bias=False)

        # QK-norm weights (RMSNorm per head)
        self.q_norm_weight = nn.Parameter(torch.ones(HEAD_DIM))
        self.k_norm_weight = nn.Parameter(torch.ones(HEAD_DIM))

    def forward(
        self,
        hidden_states: torch.Tensor,   # [batch, seq_len, hidden_size]
        cos: torch.Tensor,             # [max_seq_len, head_dim]
        sin: torch.Tensor,             # [max_seq_len, head_dim]
        position_ids: torch.Tensor,    # [batch, seq_len]
        attention_mask: torch.Tensor,  # [batch, 1, seq_len, max_seq_len]
        k_cache: torch.Tensor,         # [batch, num_kv_heads, max_seq_len, head_dim]
        v_cache: torch.Tensor,         # [batch, num_kv_heads, max_seq_len, head_dim]
        cache_position: torch.Tensor,  # [seq_len] — positions to write into cache
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Returns (output, updated_k_cache, updated_v_cache)"""
        batch, seq_len, _ = hidden_states.shape

        # Project Q, K, V
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Reshape: [batch, seq_len, num_heads * head_dim] -> [batch, seq_len, num_heads, head_dim]
        q = q.view(batch, seq_len, NUM_HEADS, HEAD_DIM)
        k = k.view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM)
        v = v.view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM)

        # Apply QK-norm (RMSNorm per head, inline)
        q = rms_norm(q, self.q_norm_weight)
        k = rms_norm(k, self.k_norm_weight)

        q = q.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
        k = k.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]
        v = v.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]

        # Apply RoPE
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        # Update KV cache using scatter (index_put)
        # cache_position: [seq_len] — the positions to update
        # k_cache shape: [batch, num_kv_heads, max_seq_len, head_dim]
        k_cache = k_cache.clone()
        v_cache = v_cache.clone()
        k_cache[:, :, cache_position, :] = k
        v_cache[:, :, cache_position, :] = v

        # Expand KV heads for GQA: repeat each KV head for its group of Q heads
        cache_len = k_cache.shape[2]  # dynamic, works for any MAX_SEQ_LEN
        k_expanded = k_cache.unsqueeze(2).expand(-1, -1, NUM_KV_GROUPS, -1, -1)
        k_expanded = k_expanded.reshape(batch, NUM_HEADS, cache_len, HEAD_DIM)
        v_expanded = v_cache.unsqueeze(2).expand(-1, -1, NUM_KV_GROUPS, -1, -1)
        v_expanded = v_expanded.reshape(batch, NUM_HEADS, cache_len, HEAD_DIM)

        # Attention: Q @ K^T / sqrt(head_dim)
        attn_weights = torch.matmul(q, k_expanded.transpose(2, 3)) * self.scaling

        # Apply attention mask
        attn_weights = attn_weights + attention_mask

        # Softmax
        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)

        # Attention output
        attn_output = torch.matmul(attn_weights, v_expanded)

        # Reshape back: [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, num_heads * head_dim]
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch, seq_len, -1)

        # Output projection
        attn_output = self.o_proj(attn_output)

        return attn_output, k_cache, v_cache

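# GQA expansion, worked through on the shapes above: k_cache is
# [1, 8, 4096, 128]; unsqueeze(2) + expand gives [1, 8, 2, 4096, 128]; the
# reshape to [1, 16, 4096, 128] flattens (kv_head, group) in row-major order,
# so KV head i serves query heads 2*i and 2*i + 1 (NUM_KV_GROUPS = 2).
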
class Qwen3MLPFixed(nn.Module):
    """Fixed Qwen3 MLP (SiLU gate + up projection)."""

    def __init__(self):
        super().__init__()
        self.gate_proj = nn.Linear(HIDDEN_SIZE, INTERMEDIATE_SIZE, bias=False)
        self.up_proj = nn.Linear(HIDDEN_SIZE, INTERMEDIATE_SIZE, bias=False)
        self.down_proj = nn.Linear(INTERMEDIATE_SIZE, HIDDEN_SIZE, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))


class Qwen3DecoderLayerFixed(nn.Module):
    """Fixed Qwen3 decoder layer with static KV cache."""

    def __init__(self, layer_idx: int):
        super().__init__()
        self.self_attn = Qwen3AttentionFixed(layer_idx)
        self.mlp = Qwen3MLPFixed()
        self.input_layernorm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))
        self.post_attention_layernorm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        position_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        k_cache: torch.Tensor,
        v_cache: torch.Tensor,
        cache_position: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # Pre-norm + self attention
        residual = hidden_states
        hidden_states = rms_norm(hidden_states, self.input_layernorm_weight)
        hidden_states, k_cache, v_cache = self.self_attn(
            hidden_states, cos, sin, position_ids, attention_mask,
            k_cache, v_cache, cache_position
        )
        hidden_states = residual + hidden_states

        # Pre-norm + MLP
        residual = hidden_states
        hidden_states = rms_norm(hidden_states, self.post_attention_layernorm_weight)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states, k_cache, v_cache


class TextDecoderFixed(nn.Module):
    """
    Complete text decoder for ExecuTorch export.
    Includes embedding, all decoder layers with static KV cache, and LM head.

    For prefill: input_ids has seq_len > 1, cache_position starts at 0
    For decode: input_ids has seq_len = 1, cache_position = current position
    """

    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
        self.layers = nn.ModuleList([
            Qwen3DecoderLayerFixed(i) for i in range(NUM_LAYERS)
        ])
        self.norm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))
        self.lm_head = nn.Linear(HIDDEN_SIZE, VOCAB_SIZE, bias=False)

        # Pre-compute RoPE frequencies
        cos, sin = precompute_rope_freqs(MAX_SEQ_LEN, HEAD_DIM, ROPE_THETA)
        self.register_buffer("rope_cos", cos)
        self.register_buffer("rope_sin", sin)

    def forward(
        self,
        input_ids: torch.Tensor,       # [batch, seq_len]
        attention_mask: torch.Tensor,  # [batch, 1, seq_len, max_seq_len]
        position_ids: torch.Tensor,    # [batch, seq_len]
        cache_position: torch.Tensor,  # [seq_len]
        *kv_caches: torch.Tensor,      # 28 * (k_cache, v_cache) flattened
    ) -> tuple:
        """
        Returns: (logits, *updated_kv_caches)
        kv_caches: 56 tensors total (28 layers * 2 for k,v)
        Each cache: [batch, num_kv_heads, max_seq_len, head_dim]
        """
        # Embed tokens
        hidden_states = self.embed_tokens(input_ids)

        # Process through all layers, updating KV caches
        updated_caches = []
        for i, layer in enumerate(self.layers):
            k_cache = kv_caches[i * 2]
            v_cache = kv_caches[i * 2 + 1]
            hidden_states, new_k, new_v = layer(
                hidden_states,
                self.rope_cos, self.rope_sin,
                position_ids, attention_mask,
                k_cache, v_cache, cache_position
            )
            updated_caches.append(new_k)
            updated_caches.append(new_v)

        # Final norm
        hidden_states = rms_norm(hidden_states, self.norm_weight)

        # LM head — only compute logits for the last token
        logits = self.lm_head(hidden_states[:, -1:, :])  # [batch, 1, vocab_size]

        return (logits, *updated_caches)

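# Calling convention, as a sketch (the names below are illustrative, not part
# of this module): a host-side generation loop would drive the decoder roughly as
#
#   caches = create_empty_kv_caches()
#   # Prefill a P-token prompt into cache slots 0..P-1:
#   out = decoder(prompt_ids,                    # [1, P]
#                 create_causal_mask(P),         # [1, 1, P, MAX_SEQ_LEN]
#                 torch.arange(P).unsqueeze(0),  # position_ids
#                 torch.arange(P),               # cache_position
#                 *caches)
#   logits, caches = out[0], out[1:]
#   # Then decode one token at a time at positions p = P, P+1, ... with
#   # seq_len=1, cache_position=torch.tensor([p]), and a mask opening columns 0..p.
#
# The caller keeps the returned caches and passes them back in on the next call.
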
def load_original_model():
    """Load the original model with proper weight remapping."""
    from transformers import AutoModelForImageTextToText
    from safetensors.torch import load_file

    print("Loading original model...")
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_DIR,
        dtype=torch.bfloat16,
        attn_implementation="sdpa",
        device_map="cpu",
    )

    state_dict = load_file(os.path.join(MODEL_DIR, "model.safetensors"))
    remapped = {}
    for k, v in state_dict.items():
        new_k = k.replace("model.vision_encoder.", "model.vision_tower.")
        new_k = new_k.replace("model.vision_projection.", "model.multi_modal_projector.")
        remapped[new_k] = v
    model.load_state_dict(remapped, strict=False)

    return model


def build_decoder_module(original_model):
    """Build the fixed decoder module from the original model's weights."""
    print("\nBuilding fixed text decoder...")

    orig_lm = original_model.model.language_model
    orig_lm_head = original_model.lm_head

    decoder = TextDecoderFixed()

    # Copy embedding weights
    decoder.embed_tokens.weight.data.copy_(orig_lm.embed_tokens.weight.data)

    # Copy final norm weight
    decoder.norm_weight.data.copy_(orig_lm.norm.weight.data)

    # Copy LM head (tied with embeddings)
    decoder.lm_head.weight.data.copy_(orig_lm.embed_tokens.weight.data)

    # Copy layer weights
    for i in range(NUM_LAYERS):
        orig_layer = orig_lm.layers[i]
        fixed_layer = decoder.layers[i]

        # Attention projections
        fixed_layer.self_attn.q_proj.weight.data.copy_(orig_layer.self_attn.q_proj.weight.data)
        fixed_layer.self_attn.k_proj.weight.data.copy_(orig_layer.self_attn.k_proj.weight.data)
        fixed_layer.self_attn.v_proj.weight.data.copy_(orig_layer.self_attn.v_proj.weight.data)
        fixed_layer.self_attn.o_proj.weight.data.copy_(orig_layer.self_attn.o_proj.weight.data)

        # QK-norm weights
        fixed_layer.self_attn.q_norm_weight.data.copy_(orig_layer.self_attn.q_norm.weight.data)
        fixed_layer.self_attn.k_norm_weight.data.copy_(orig_layer.self_attn.k_norm.weight.data)

        # Layer norms
        fixed_layer.input_layernorm_weight.data.copy_(orig_layer.input_layernorm.weight.data)
        fixed_layer.post_attention_layernorm_weight.data.copy_(orig_layer.post_attention_layernorm.weight.data)

        # MLP
        fixed_layer.mlp.gate_proj.weight.data.copy_(orig_layer.mlp.gate_proj.weight.data)
        fixed_layer.mlp.up_proj.weight.data.copy_(orig_layer.mlp.up_proj.weight.data)
        fixed_layer.mlp.down_proj.weight.data.copy_(orig_layer.mlp.down_proj.weight.data)

    decoder.eval()
    total_params = sum(p.numel() for p in decoder.parameters())
    print(f"  Decoder parameters: {total_params/1e6:.2f}M")

    return decoder

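# Expected count, from the constants alone: each layer carries
# 2*1024*2048 + 2*1024*1024 = 6.29M attention weights plus 3*1024*3072 = 9.44M
# MLP weights (about 15.73M), so 28 layers give roughly 440.5M; the embedding
# and the untied copy of the LM head add 151936*1024 = 155.58M each, for
# roughly 751.6M parameters in total.
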
def create_empty_kv_caches(batch_size: int = 1, dtype=torch.float32, device="cpu"):
    """Create empty KV cache tensors for all layers."""
    caches = []
    for _ in range(NUM_LAYERS):
        k = torch.zeros(batch_size, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM, dtype=dtype, device=device)
        v = torch.zeros(batch_size, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM, dtype=dtype, device=device)
        caches.extend([k, v])
    return tuple(caches)

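# Footprint check (batch_size=1): each tensor holds 8 * 4096 * 128 = 4,194,304
# elements; 28 layers * 2 tensors = 234,881,024 elements in total, i.e. about
# 940 MB in float32 or 470 MB in bfloat16 on top of the weights.
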
def create_causal_mask(seq_len: int, cache_len: int = MAX_SEQ_LEN, dtype=torch.float32,
                       start_pos: int = 0):
    """Causal mask for queries at cache positions [start_pos, start_pos + seq_len).

    Row i may attend to cache columns 0..start_pos + i; all later columns,
    including still-unwritten cache slots, get -inf so softmax ignores them.
    """
    mask = torch.full((seq_len, cache_len), float("-inf"), dtype=dtype)
    mask = torch.triu(mask, diagonal=start_pos + 1)
    return mask.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, cache_len]

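# Example (seq_len=3, cache_len=5, start_pos=0); 0 marks attendable columns:
#   row 0: [0, -inf, -inf, -inf, -inf]
#   row 1: [0,    0, -inf, -inf, -inf]
#   row 2: [0,    0,    0, -inf, -inf]
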
def test_decoder_module(decoder, original_model):
    """Test that the fixed decoder produces same output as original."""
    print("\nTesting decoder output consistency...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    decoder = decoder.to(device).to(torch.bfloat16)
    original_model = original_model.to(device)

    # Test input
    input_ids = torch.tensor([[1, 2, 3, 4, 5]], device=device)
    seq_len = input_ids.shape[1]
    position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
    cache_position = torch.arange(seq_len, device=device)

    # Causal mask
    mask = create_causal_mask(seq_len, dtype=torch.bfloat16).to(device)

    # Empty KV caches
    kv_caches = create_empty_kv_caches(1, torch.bfloat16, device)

    with torch.no_grad():
        # Fixed decoder
        result = decoder(input_ids, mask, position_ids, cache_position, *kv_caches)
        fixed_logits = result[0]
        print(f"  Fixed decoder output shape: {fixed_logits.shape}")

        # Original model (text-only, no image)
        orig_outputs = original_model(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            use_cache=False,
        )
        orig_logits = orig_outputs.logits[:, -1:, :]
        print(f"  Original model output shape: {orig_logits.shape}")

        # Compare
        diff = (fixed_logits.float() - orig_logits.float()).abs()
        print(f"  Max absolute difference: {diff.max().item():.6f}")
        print(f"  Mean absolute difference: {diff.mean().item():.6f}")

        # Check top-k predictions match
        fixed_topk = fixed_logits.float().topk(5, dim=-1)
        orig_topk = orig_logits.float().topk(5, dim=-1)
        print(f"  Fixed top-5 token IDs: {fixed_topk.indices[0, 0].tolist()}")
        print(f"  Original top-5 token IDs: {orig_topk.indices[0, 0].tolist()}")
        matching = sum(1 for t in fixed_topk.indices[0, 0].tolist() if t in orig_topk.indices[0, 0].tolist())
        print(f"  Top-5 overlap: {matching}/5")

def try_torch_export(decoder):
    """Attempt torch.export.export() on the decoder."""
    print("\n" + "=" * 60)
    print("ATTEMPTING torch.export.export() on decoder")
    print("=" * 60)

    # Export on CPU with float32 for XNNPACK
    decoder = decoder.to("cpu").to(torch.float32)
    decoder.eval()

    batch_size = 1
    seq_len = 1  # Export for single-token decode step (simpler)

    input_ids = torch.randint(0, VOCAB_SIZE, (batch_size, seq_len))
    attention_mask = create_causal_mask(seq_len, MAX_SEQ_LEN, torch.float32)
    position_ids = torch.zeros(batch_size, seq_len, dtype=torch.long)
    cache_position = torch.zeros(seq_len, dtype=torch.long)
    kv_caches = create_empty_kv_caches(batch_size, torch.float32, "cpu")

    example_args = (input_ids, attention_mask, position_ids, cache_position, *kv_caches)

    try:
        print(f"  Exporting with seq_len={seq_len}, max_cache={MAX_SEQ_LEN}...")
        print(f"  Number of input tensors: {len(example_args)} (4 + {NUM_LAYERS}*2 KV caches)")
        exported = torch.export.export(
            decoder,
            example_args,
            strict=False,
        )
        print("  SUCCESS! torch.export completed!")
        return exported

    except Exception as e:
        print(f"  FAILED: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()

        # Try with trace as fallback
        print("\n  Trying torch.jit.trace as fallback...")
        try:
            traced = torch.jit.trace(decoder, example_args)
            print("  torch.jit.trace succeeded!")
            return traced
        except Exception as e2:
            print(f"  torch.jit.trace also failed: {type(e2).__name__}: {e2}")

    return None

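# Note: exporting with a concrete seq_len=1 specializes the graph to the
# single-token decode step. Serving a multi-token prefill would need either a
# second export at that length or marking the sequence axis dynamic via the
# dynamic_shapes argument of torch.export.export().
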
def export_to_pte(exported_model):
    """Convert exported model to .pte using XNNPACK backend."""
    print("\n" + "=" * 60)
    print("EXPORTING DECODER TO .pte (XNNPACK)")
    print("=" * 60)

    try:
        from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig
        from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

        if not hasattr(exported_model, 'graph_module'):
            print("  Need torch.export.export() result for .pte export")
            return None

        print("  Running to_edge_transform_and_lower...")
        edge = to_edge_transform_and_lower(
            exported_model,
            compile_config=EdgeCompileConfig(_check_ir_validity=False),
            partitioner=[XnnpackPartitioner()],
        )

        print("  Running to_executorch()...")
        pte = edge.to_executorch()

        output_path = "text_decoder.pte"
        with open(output_path, "wb") as f:
            f.write(pte.buffer)

        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"  Saved to {output_path} ({file_size:.1f} MB)")
        return output_path

    except ImportError as e:
        print(f"  ExecuTorch import failed: {e}")
        return None
    except Exception as e:
        print(f"  Export failed: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None

def main():
    print("=" * 60)
    print("Text Decoder Export for ExecuTorch")
    print(f"Architecture: Qwen3 {NUM_LAYERS}L, {NUM_HEADS}H/{NUM_KV_HEADS}KV, dim={HIDDEN_SIZE}")
    print(f"Max seq len: {MAX_SEQ_LEN}")
    print(f"KV cache size per layer: {NUM_KV_HEADS}x{MAX_SEQ_LEN}x{HEAD_DIM} = {NUM_KV_HEADS*MAX_SEQ_LEN*HEAD_DIM/1e6:.2f}M elements")
    print("=" * 60)

    # Load original model
    original_model = load_original_model()

    # Build fixed decoder
    decoder = build_decoder_module(original_model)

    # Test consistency
    test_decoder_module(decoder, original_model)

    # Free original model memory
    del original_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Try torch.export
    exported = try_torch_export(decoder)

    if exported is not None:
        export_to_pte(exported)

    # Save the PyTorch module for later use
    torch.save(decoder.state_dict(), "text_decoder_fixed.pt")
    print("\nSaved fixed decoder state dict to text_decoder_fixed.pt")
    print("Decoder export script complete!")


if __name__ == "__main__":
    main()