"""
Phase 3b: Text Decoder Export for ExecuTorch
Extracts language_model + lm_head into a standalone nn.Module
with static KV cache tensors for torch.export compatibility.

Architecture: Qwen3 decoder (28 layers, GQA 16/8 heads, head_dim=128)
Fixed max_seq_len: 4096 (see MAX_SEQ_LEN below)
"""
|
|
| import os |
| import sys |
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
| |
# --- Qwen3 decoder architecture hyperparameters (must match the checkpoint) ---
HIDDEN_SIZE = 1024          # model/embedding dimension
NUM_LAYERS = 28             # number of decoder layers
NUM_HEADS = 16              # query attention heads
NUM_KV_HEADS = 8            # key/value heads (grouped-query attention)
HEAD_DIM = 128              # per-head feature dimension
INTERMEDIATE_SIZE = 3072    # MLP hidden size
VOCAB_SIZE = 151936         # vocabulary size of the LM head / embedding
MAX_SEQ_LEN = 4096          # fixed static KV-cache length used for export
RMS_EPS = 1e-6              # RMSNorm epsilon
ROPE_THETA = 1000000.0      # RoPE base frequency
NUM_KV_GROUPS = NUM_HEADS // NUM_KV_HEADS  # query heads sharing one KV head

# Local path of the pretrained checkpoint the weights are extracted from.
MODEL_DIR = "./models/LightOnOCR-2-1B"
|
|
|
|
def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = RMS_EPS) -> torch.Tensor:
    """RMS-normalize the last dimension of *x*, then scale by *weight*.

    The reduction runs in float32 for numerical stability and the result is
    cast back to the caller's dtype before the learned scale is applied.
    Implemented inline to avoid the @use_kernel_forward_from_hub decorator.
    """
    orig_dtype = x.dtype
    xf = x.to(torch.float32)
    inv_rms = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
    return weight * (xf * inv_rms).to(orig_dtype)
|
|
|
|
def precompute_rope_freqs(max_seq_len: int, head_dim: int, theta: float = ROPE_THETA):
    """Build RoPE cos/sin lookup tables, each of shape [max_seq_len, head_dim].

    The table covers head_dim // 2 rotary frequency pairs; the angle matrix
    is duplicated along the last axis so it lines up with the rotate_half
    convention used when the embedding is applied.
    """
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(max_seq_len, dtype=torch.float32)
    angles = torch.outer(positions, inv_freq)      # [max_seq_len, head_dim // 2]
    angles = torch.cat((angles, angles), dim=-1)   # duplicate for rotate_half
    return angles.cos(), angles.sin()
|
|
|
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Rotate query/key states with precomputed RoPE tables.

    q, k:         [batch, num_heads, seq_len, head_dim]
    cos, sin:     [max_seq_len, head_dim] lookup tables
    position_ids: [batch, seq_len] absolute positions into the tables
    Returns the rotated (q, k) pair.
    """
    # Gather per-position table rows and insert a broadcast axis over the
    # head dimension: [batch, 1, seq_len, head_dim].
    cos_sel = cos[position_ids].unsqueeze(1)
    sin_sel = sin[position_ids].unsqueeze(1)

    def _rotate(t):
        # (x1, x2) -> (-x2, x1) on the two halves of the last dimension.
        half = t.shape[-1] // 2
        return torch.cat((-t[..., half:], t[..., :half]), dim=-1)

    return (
        q * cos_sel + _rotate(q) * sin_sel,
        k * cos_sel + _rotate(k) * sin_sel,
    )
|
|
|
|
def rotate_half(x):
    """Map (x1, x2) -> (-x2, x1), where x1/x2 are the halves of the last dim."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((second.neg(), first), dim=-1)
|
|
|
|
class Qwen3AttentionFixed(nn.Module):
    """
    Fixed Qwen3 attention with static KV cache, inline QK-norm, and
    no dynamic dispatch. Designed for torch.export compatibility.
    """

    def __init__(self, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx  # kept for parity with the HF layer; unused in forward
        # Standard 1/sqrt(head_dim) attention scale.
        self.scaling = HEAD_DIM ** -0.5

        # Fused per-head projections; no biases in Qwen3 attention.
        self.q_proj = nn.Linear(HIDDEN_SIZE, NUM_HEADS * HEAD_DIM, bias=False)
        self.k_proj = nn.Linear(HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False)
        self.v_proj = nn.Linear(HIDDEN_SIZE, NUM_KV_HEADS * HEAD_DIM, bias=False)
        self.o_proj = nn.Linear(NUM_HEADS * HEAD_DIM, HIDDEN_SIZE, bias=False)

        # QK-norm weights held as raw parameters so the inline rms_norm
        # helper can be used instead of a hub-decorated norm module.
        self.q_norm_weight = nn.Parameter(torch.ones(HEAD_DIM))
        self.k_norm_weight = nn.Parameter(torch.ones(HEAD_DIM))

    def forward(
        self,
        hidden_states: torch.Tensor,    # [batch, seq_len, HIDDEN_SIZE]
        cos: torch.Tensor,              # [max_seq_len, HEAD_DIM] RoPE table
        sin: torch.Tensor,              # [max_seq_len, HEAD_DIM] RoPE table
        position_ids: torch.Tensor,     # [batch, seq_len]
        attention_mask: torch.Tensor,   # additive mask broadcastable to scores
        k_cache: torch.Tensor,          # [batch, NUM_KV_HEADS, max_seq_len, HEAD_DIM]
        v_cache: torch.Tensor,          # [batch, NUM_KV_HEADS, max_seq_len, HEAD_DIM]
        cache_position: torch.Tensor,   # [seq_len] cache slots receiving this step's k/v
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Returns (output, updated_k_cache, updated_v_cache)"""
        batch, seq_len, _ = hidden_states.shape

        # Project to queries/keys/values.
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Split fused projections into per-head slices.
        q = q.view(batch, seq_len, NUM_HEADS, HEAD_DIM)
        k = k.view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM)
        v = v.view(batch, seq_len, NUM_KV_HEADS, HEAD_DIM)

        # Qwen3 QK-norm: RMS-normalize each head's features before RoPE.
        q = rms_norm(q, self.q_norm_weight)
        k = rms_norm(k, self.k_norm_weight)

        # To [batch, heads, seq_len, head_dim] layout for attention.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Rotary position embedding on queries and keys.
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        # Functional cache update: clone first so the exported graph has no
        # in-place mutation of its inputs, then scatter this step's k/v into
        # the slots named by cache_position.
        k_cache = k_cache.clone()
        v_cache = v_cache.clone()
        k_cache[:, :, cache_position, :] = k
        v_cache[:, :, cache_position, :] = v

        # GQA: repeat each KV head NUM_KV_GROUPS times so every query head
        # has a matching key/value stream. Attention always spans the full
        # static cache, so unwritten slots must be blocked by attention_mask.
        cache_len = k_cache.shape[2]
        k_expanded = k_cache.unsqueeze(2).expand(-1, -1, NUM_KV_GROUPS, -1, -1)
        k_expanded = k_expanded.reshape(batch, NUM_HEADS, cache_len, HEAD_DIM)
        v_expanded = v_cache.unsqueeze(2).expand(-1, -1, NUM_KV_GROUPS, -1, -1)
        v_expanded = v_expanded.reshape(batch, NUM_HEADS, cache_len, HEAD_DIM)

        # Scaled dot-product scores: [batch, heads, seq_len, cache_len].
        attn_weights = torch.matmul(q, k_expanded.transpose(2, 3)) * self.scaling

        # Additive mask (0 where visible, -inf where blocked).
        attn_weights = attn_weights + attention_mask

        # Softmax in float32 for stability, then back to the compute dtype.
        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)

        # Weighted sum over values.
        attn_output = torch.matmul(attn_weights, v_expanded)

        # Back to [batch, seq_len, heads * head_dim] and project out.
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch, seq_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, k_cache, v_cache
|
|
|
|
class Qwen3MLPFixed(nn.Module):
    """Fixed Qwen3 MLP: SiLU-gated (SwiGLU-style) feed-forward block."""

    def __init__(self):
        super().__init__()
        # Gate and up project hidden -> intermediate; down projects back.
        self.gate_proj = nn.Linear(HIDDEN_SIZE, INTERMEDIATE_SIZE, bias=False)
        self.up_proj = nn.Linear(HIDDEN_SIZE, INTERMEDIATE_SIZE, bias=False)
        self.down_proj = nn.Linear(INTERMEDIATE_SIZE, HIDDEN_SIZE, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # down( silu(gate(x)) * up(x) )
        gated = F.silu(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
|
|
|
|
class Qwen3DecoderLayerFixed(nn.Module):
    """Fixed Qwen3 decoder layer with static KV cache."""

    def __init__(self, layer_idx: int):
        super().__init__()
        self.self_attn = Qwen3AttentionFixed(layer_idx)
        self.mlp = Qwen3MLPFixed()
        # RMSNorm weights kept as raw parameters (the norm is inlined via
        # rms_norm rather than wrapped in a module).
        self.input_layernorm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))
        self.post_attention_layernorm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        position_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        k_cache: torch.Tensor,
        v_cache: torch.Tensor,
        cache_position: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Pre-norm transformer block: attention then MLP, each wrapped in a
        residual connection. Returns (hidden_states, k_cache, v_cache)."""
        # Self-attention sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = rms_norm(hidden_states, self.input_layernorm_weight)
        hidden_states, k_cache, v_cache = self.self_attn(
            hidden_states, cos, sin, position_ids, attention_mask,
            k_cache, v_cache, cache_position
        )
        hidden_states = residual + hidden_states

        # MLP sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = rms_norm(hidden_states, self.post_attention_layernorm_weight)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states, k_cache, v_cache
|
|
|
|
class TextDecoderFixed(nn.Module):
    """
    Complete text decoder for ExecuTorch export.
    Includes embedding, all decoder layers with static KV cache, and LM head.

    For prefill: input_ids has seq_len > 1, cache_position starts at 0
    For decode: input_ids has seq_len = 1, cache_position = current position
    """

    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
        self.layers = nn.ModuleList([
            Qwen3DecoderLayerFixed(i) for i in range(NUM_LAYERS)
        ])
        # Final RMSNorm weight (applied inline via rms_norm).
        self.norm_weight = nn.Parameter(torch.ones(HIDDEN_SIZE))
        self.lm_head = nn.Linear(HIDDEN_SIZE, VOCAB_SIZE, bias=False)

        # Precompute RoPE cos/sin once and register as buffers so they are
        # captured with the module state at export time.
        cos, sin = precompute_rope_freqs(MAX_SEQ_LEN, HEAD_DIM, ROPE_THETA)
        self.register_buffer("rope_cos", cos)
        self.register_buffer("rope_sin", sin)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        position_ids: torch.Tensor,
        cache_position: torch.Tensor,
        *kv_caches: torch.Tensor,
    ) -> tuple:
        """
        Returns: (logits, *updated_kv_caches)
        kv_caches: 56 tensors total (28 layers * 2 for k,v)
        Each cache: [batch, num_kv_heads, max_seq_len, head_dim]
        """
        hidden_states = self.embed_tokens(input_ids)

        # Thread the functional KV caches through every layer, collecting
        # the updated tensors to hand back to the caller.
        updated_caches = []
        for i, layer in enumerate(self.layers):
            k_cache = kv_caches[i * 2]
            v_cache = kv_caches[i * 2 + 1]
            hidden_states, new_k, new_v = layer(
                hidden_states,
                self.rope_cos, self.rope_sin,
                position_ids, attention_mask,
                k_cache, v_cache, cache_position
            )
            updated_caches.append(new_k)
            updated_caches.append(new_v)

        # Final norm before the LM head.
        hidden_states = rms_norm(hidden_states, self.norm_weight)

        # Only the last position's logits are needed for generation, so the
        # LM head runs on a single position to keep the output small.
        logits = self.lm_head(hidden_states[:, -1:, :])

        return (logits, *updated_caches)
|
|
|
|
def load_original_model():
    """Load the original model with proper weight remapping.

    Loads the HF image-text-to-text model from MODEL_DIR in bfloat16 on CPU,
    then re-loads the raw safetensors with the checkpoint's vision-module
    names remapped to the transformers attribute names.
    """
    from transformers import AutoModelForImageTextToText
    from safetensors.torch import load_file

    print("Loading original model...")
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_DIR,
        dtype=torch.bfloat16,
        attn_implementation="sdpa",
        device_map="cpu",
    )

    # The checkpoint stores vision weights under names differing from the
    # transformers class attributes; remap before loading.
    # NOTE(review): strict=False silently skips any keys that still do not
    # match — consider inspecting load_state_dict's missing/unexpected keys.
    state_dict = load_file(os.path.join(MODEL_DIR, "model.safetensors"))
    remapped = {}
    for k, v in state_dict.items():
        new_k = k.replace("model.vision_encoder.", "model.vision_tower.")
        new_k = new_k.replace("model.vision_projection.", "model.multi_modal_projector.")
        remapped[new_k] = v
    model.load_state_dict(remapped, strict=False)

    return model
|
|
|
|
def build_decoder_module(original_model):
    """Build the fixed decoder module from the original model's weights.

    Copies every language-model tensor from the HF model into the flat
    TextDecoderFixed layout (norm weights become raw parameters).
    """
    print("\nBuilding fixed text decoder...")

    orig_lm = original_model.model.language_model
    # NOTE(review): fetched but unused — the LM head below is filled from
    # embed_tokens instead. This assumes the checkpoint ties lm_head to the
    # embedding matrix; confirm, otherwise copy orig_lm_head.weight.
    orig_lm_head = original_model.lm_head

    decoder = TextDecoderFixed()

    # Token embedding.
    decoder.embed_tokens.weight.data.copy_(orig_lm.embed_tokens.weight.data)

    # Final RMSNorm.
    decoder.norm_weight.data.copy_(orig_lm.norm.weight.data)

    # LM head — tied to the embedding weights (see note above).
    decoder.lm_head.weight.data.copy_(orig_lm.embed_tokens.weight.data)

    # Per-layer weights.
    for i in range(NUM_LAYERS):
        orig_layer = orig_lm.layers[i]
        fixed_layer = decoder.layers[i]

        # Attention projections.
        fixed_layer.self_attn.q_proj.weight.data.copy_(orig_layer.self_attn.q_proj.weight.data)
        fixed_layer.self_attn.k_proj.weight.data.copy_(orig_layer.self_attn.k_proj.weight.data)
        fixed_layer.self_attn.v_proj.weight.data.copy_(orig_layer.self_attn.v_proj.weight.data)
        fixed_layer.self_attn.o_proj.weight.data.copy_(orig_layer.self_attn.o_proj.weight.data)

        # QK-norm weights (module -> raw parameter).
        fixed_layer.self_attn.q_norm_weight.data.copy_(orig_layer.self_attn.q_norm.weight.data)
        fixed_layer.self_attn.k_norm_weight.data.copy_(orig_layer.self_attn.k_norm.weight.data)

        # Layer norms (module -> raw parameter).
        fixed_layer.input_layernorm_weight.data.copy_(orig_layer.input_layernorm.weight.data)
        fixed_layer.post_attention_layernorm_weight.data.copy_(orig_layer.post_attention_layernorm.weight.data)

        # MLP projections.
        fixed_layer.mlp.gate_proj.weight.data.copy_(orig_layer.mlp.gate_proj.weight.data)
        fixed_layer.mlp.up_proj.weight.data.copy_(orig_layer.mlp.up_proj.weight.data)
        fixed_layer.mlp.down_proj.weight.data.copy_(orig_layer.mlp.down_proj.weight.data)

    decoder.eval()
    total_params = sum(p.numel() for p in decoder.parameters())
    print(f" Decoder parameters: {total_params/1e6:.2f}M")

    return decoder
|
|
|
|
def create_empty_kv_caches(batch_size: int = 1, dtype=torch.float32, device="cpu"):
    """Allocate zeroed static KV caches, two per layer (k then v, in order).

    Returns a tuple of 2 * NUM_LAYERS tensors, each of shape
    [batch_size, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM].
    """
    shape = (batch_size, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM)
    return tuple(
        torch.zeros(shape, dtype=dtype, device=device)
        for _ in range(2 * NUM_LAYERS)
    )
|
|
|
|
def create_causal_mask(seq_len: int, cache_len=None, dtype=torch.float32, start_pos: int = 0):
    """Build an additive causal attention mask over a static KV cache.

    Query row i corresponds to absolute position ``start_pos + i``; it may
    attend cache slots 0 .. start_pos + i and is blocked (-inf) everywhere
    else — including the not-yet-written tail of the static cache, whose
    zero-valued k/v entries would otherwise leak into the softmax.

    Args:
        seq_len: number of query positions in this forward pass.
        cache_len: total static cache length; defaults to MAX_SEQ_LEN.
        dtype: dtype of the returned mask.
        start_pos: absolute position of the first query token (0 for
            prefill; the current cache position for single-token decode).

    Returns:
        [1, 1, seq_len, cache_len] tensor of 0 / -inf values.
    """
    if cache_len is None:
        cache_len = MAX_SEQ_LEN
    mask = torch.full((seq_len, cache_len), float("-inf"), dtype=dtype)
    # Keep -inf exactly where the key index exceeds the query's absolute
    # position: entry (i, j) is blocked iff j > start_pos + i.
    # BUGFIX: the previous diagonal (cache_len - seq_len + 1) right-aligned
    # the queries at the cache tail, but callers write k/v with
    # cache_position = arange(seq_len) (left-aligned), so prefill queries
    # could both see future tokens and attend empty zero-filled cache
    # slots, and a seq_len=1 decode step was not masked at all.
    mask = torch.triu(mask, diagonal=start_pos + 1)
    return mask.unsqueeze(0).unsqueeze(0)
|
|
|
|
def test_decoder_module(decoder, original_model):
    """Test that the fixed decoder produces same output as original.

    Runs a 5-token prefill through the fixed decoder (static KV cache) and
    through the HF model without cache, then compares last-token logits and
    the top-5 next-token candidates.
    """
    print("\nTesting decoder output consistency...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    decoder = decoder.to(device).to(torch.bfloat16)
    original_model = original_model.to(device)

    # Tiny fixed prompt; positions and cache slots 0..4.
    input_ids = torch.tensor([[1, 2, 3, 4, 5]], device=device)
    seq_len = input_ids.shape[1]
    position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
    cache_position = torch.arange(seq_len, device=device)

    # Causal mask spanning the full static cache.
    mask = create_causal_mask(seq_len, dtype=torch.bfloat16).to(device)

    # Fresh zeroed KV caches for all layers.
    kv_caches = create_empty_kv_caches(1, torch.bfloat16, device)

    with torch.no_grad():
        # Prefill through the fixed decoder; logits for the last token only.
        result = decoder(input_ids, mask, position_ids, cache_position, *kv_caches)
        fixed_logits = result[0]
        print(f" Fixed decoder output shape: {fixed_logits.shape}")

        # Reference: HF forward without KV cache; keep last-position logits.
        orig_outputs = original_model(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            use_cache=False,
        )
        orig_logits = orig_outputs.logits[:, -1:, :]
        print(f" Original model output shape: {orig_logits.shape}")

        # Element-wise agreement, compared in float32.
        diff = (fixed_logits.float() - orig_logits.float()).abs()
        print(f" Max absolute difference: {diff.max().item():.6f}")
        print(f" Mean absolute difference: {diff.mean().item():.6f}")

        # Ranking agreement: overlap of the top-5 next-token candidates.
        fixed_topk = fixed_logits.float().topk(5, dim=-1)
        orig_topk = orig_logits.float().topk(5, dim=-1)
        print(f" Fixed top-5 token IDs: {fixed_topk.indices[0, 0].tolist()}")
        print(f" Original top-5 token IDs: {orig_topk.indices[0, 0].tolist()}")
        matching = sum(1 for t in fixed_topk.indices[0, 0].tolist() if t in orig_topk.indices[0, 0].tolist())
        print(f" Top-5 overlap: {matching}/5")
|
|
|
|
def try_torch_export(decoder):
    """Attempt torch.export.export() on the decoder.

    Exports a single-token decode step (seq_len=1) with the full set of
    static KV-cache example inputs; falls back to torch.jit.trace if export
    fails. Returns the exported/traced artifact, or None if both fail.
    """
    print("\n" + "=" * 60)
    print("ATTEMPTING torch.export.export() on decoder")
    print("=" * 60)

    # Export in float32 on CPU.
    # NOTE(review): fp32 is presumably required by the downstream XNNPACK
    # lowering — confirm against the targeted ExecuTorch backend.
    decoder = decoder.to("cpu").to(torch.float32)
    decoder.eval()

    batch_size = 1
    seq_len = 1

    # Example inputs for a decode step at cache position 0.
    input_ids = torch.randint(0, VOCAB_SIZE, (batch_size, seq_len))
    attention_mask = create_causal_mask(seq_len, MAX_SEQ_LEN, torch.float32)
    position_ids = torch.zeros(batch_size, seq_len, dtype=torch.long)
    cache_position = torch.zeros(seq_len, dtype=torch.long)
    kv_caches = create_empty_kv_caches(batch_size, torch.float32, "cpu")

    example_args = (input_ids, attention_mask, position_ids, cache_position, *kv_caches)

    try:
        print(f" Exporting with seq_len={seq_len}, max_cache={MAX_SEQ_LEN}...")
        print(f" Number of input tensors: {len(example_args)} (4 + {NUM_LAYERS}*2 KV caches)")
        # strict=False uses the non-strict tracer, which tolerates more
        # Python constructs than strict mode.
        exported = torch.export.export(
            decoder,
            example_args,
            strict=False,
        )
        print(" SUCCESS! torch.export completed!")
        return exported

    except Exception as e:
        print(f" FAILED: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()

        # Fallback: TorchScript tracing. The result cannot be lowered to
        # .pte, but shows whether the module is traceable at all.
        print("\n Trying torch.jit.trace as fallback...")
        try:
            traced = torch.jit.trace(decoder, example_args)
            print(" torch.jit.trace succeeded!")
            return traced
        except Exception as e2:
            print(f" torch.jit.trace also failed: {type(e2).__name__}: {e2}")

    return None
|
|
|
|
def export_to_pte(exported_model):
    """Convert exported model to .pte using XNNPACK backend.

    Lowers a torch.export artifact through ExecuTorch's edge dialect with
    the XNNPACK partitioner and writes text_decoder.pte. Returns the output
    path, or None on any failure (including a missing executorch install).
    """
    print("\n" + "=" * 60)
    print("EXPORTING DECODER TO .pte (XNNPACK)")
    print("=" * 60)

    try:
        from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig
        from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

        # A torch.jit.trace fallback result carries no export graph and
        # cannot be lowered to .pte.
        if not hasattr(exported_model, 'graph_module'):
            print(" Need torch.export.export() result for .pte export")
            return None

        print(" Running to_edge_transform_and_lower...")
        edge = to_edge_transform_and_lower(
            exported_model,
            compile_config=EdgeCompileConfig(_check_ir_validity=False),
            partitioner=[XnnpackPartitioner()],
        )

        print(" Running to_executorch()...")
        pte = edge.to_executorch()

        output_path = "text_decoder.pte"
        with open(output_path, "wb") as f:
            f.write(pte.buffer)

        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f" Saved to {output_path} ({file_size:.1f} MB)")
        return output_path

    except ImportError as e:
        print(f" ExecuTorch import failed: {e}")
        return None
    except Exception as e:
        print(f" Export failed: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
def main():
    """Run the full pipeline: load -> rebuild -> verify -> export -> save."""
    print("=" * 60)
    print("Text Decoder Export for ExecuTorch")
    print(f"Architecture: Qwen3 {NUM_LAYERS}L, {NUM_HEADS}H/{NUM_KV_HEADS}KV, dim={HIDDEN_SIZE}")
    print(f"Max seq len: {MAX_SEQ_LEN}")
    print(f"KV cache size per layer: {NUM_KV_HEADS}x{MAX_SEQ_LEN}x{HEAD_DIM} = {NUM_KV_HEADS*MAX_SEQ_LEN*HEAD_DIM/1e6:.2f}M elements")
    print("=" * 60)

    # Load the pretrained checkpoint and rebuild it as the fixed decoder.
    original_model = load_original_model()
    decoder = build_decoder_module(original_model)

    # Sanity-check the rebuilt decoder against the original model.
    test_decoder_module(decoder, original_model)

    # Free the original model before export to reduce peak memory.
    del original_model
    # Idiom fix: this was a conditional expression used as a statement
    # (`torch.cuda.empty_cache() if ... else None`); a plain `if` is clearer.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Attempt torch.export and, if it succeeds, lower to a .pte file.
    exported = try_torch_export(decoder)
    if exported is not None:
        export_to_pte(exported)

    # Always save the plain state dict as a fallback artifact.
    torch.save(decoder.state_dict(), "text_decoder_fixed.pt")
    print("\nSaved fixed decoder state dict to text_decoder_fixed.pt")
    print("Decoder export script complete!")
|
|
|
|
# Script entry point: run the full extract -> verify -> export pipeline.
if __name__ == "__main__":
    main()
|
|