"""
BitMar Model for Hugging Face Transformers
BitNet-quantized Vision-Language Episodic Memory Transformer
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
import math
import os
import pickle
import gzip
from typing import Dict, List, Optional, Tuple, Union
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
import time
logger = logging.getLogger(__name__)
class BitMarConfig(PretrainedConfig):
"""Configuration class for BitMar model"""
model_type = "bitmar"
def __init__(
self,
vocab_size: int = 50257,
text_encoder_dim: int = 128,
text_encoder_layers: int = 4,
text_encoder_heads: int = 4,
text_decoder_dim: int = 128,
text_decoder_layers: int = 4,
text_decoder_heads: int = 4,
vision_encoder_dim: int = 768,
vision_latent_size: int = 128,
vision_hidden_size: int = 64,
vision_compression_method: str = "learned_compression",
vision_spatial_pooling: bool = True,
vision_pool_size: int = 2,
fusion_hidden_size: int = 128,
fusion_num_heads: int = 4,
fusion_num_layers: int = 2,
memory_size: int = 32,
episode_dim: int = 128,
memory_alpha: float = 0.2,
direct_writing: bool = True,
memory_compression: bool = True,
max_seq_len: int = 256,
dropout: float = 0.15,
initializer_range: float = 0.02,
layer_norm_epsilon: float = 1e-5,
use_cache: bool = True,
tie_word_embeddings: bool = True,
pad_token_id: int = 50256,
bos_token_id: int = 50256,
eos_token_id: int = 50256,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs
)
self.vocab_size = vocab_size
self.text_encoder_dim = text_encoder_dim
self.text_encoder_layers = text_encoder_layers
self.text_encoder_heads = text_encoder_heads
self.text_decoder_dim = text_decoder_dim
self.text_decoder_layers = text_decoder_layers
self.text_decoder_heads = text_decoder_heads
self.vision_encoder_dim = vision_encoder_dim
self.vision_latent_size = vision_latent_size
self.vision_hidden_size = vision_hidden_size
self.vision_compression_method = vision_compression_method
self.vision_spatial_pooling = vision_spatial_pooling
self.vision_pool_size = vision_pool_size
self.fusion_hidden_size = fusion_hidden_size
self.fusion_num_heads = fusion_num_heads
self.fusion_num_layers = fusion_num_layers
self.memory_size = memory_size
self.episode_dim = episode_dim
self.memory_alpha = memory_alpha
self.direct_writing = direct_writing
self.memory_compression = memory_compression
self.max_seq_len = max_seq_len
self.dropout = dropout
self.initializer_range = initializer_range
self.layer_norm_epsilon = layer_norm_epsilon
self.use_cache = use_cache
self.tie_word_embeddings = tie_word_embeddings
class BitNetLinear(nn.Module):
"""1.58-bit Linear layer following BitNet b1.58 architecture - FIXED VERSION"""
def __init__(self, in_features: int, out_features: int, bias: bool = True):
super().__init__()
self.in_features = in_features
self.out_features = out_features
# Weight parameters (full precision for training)
self.weight = nn.Parameter(torch.randn(out_features, in_features))
self.bias = nn.Parameter(torch.zeros(out_features)) if bias else None
        # Quantization scale buffers (persisted with the module state)
self.register_buffer('weight_scale', torch.tensor(1.0))
self.register_buffer('input_scale', torch.tensor(1.0))
def quantize_weights_1_58_bit(self, weight: torch.Tensor) -> torch.Tensor:
"""BitNet b1.58 weight quantization: {-1, 0, +1}"""
# Handle empty tensors
if weight.numel() == 0:
return weight
# Compute scaling factor with numerical stability
scale = weight.abs().mean()
# Handle case where all weights are zero
if scale < 1e-8:
scale = torch.tensor(1e-5, device=weight.device, dtype=weight.dtype)
self.weight_scale.data = scale.clamp(min=1e-5, max=1e3)
# Normalize weights with gradient clipping
weight_norm = torch.clamp(weight / self.weight_scale, min=-10.0, max=10.0)
# 1.58-bit quantization with threshold
threshold = 2.0 / 3.0 # Optimal threshold for ternary quantization
# Create ternary weights
quantized = torch.zeros_like(weight_norm)
quantized[weight_norm > threshold] = 1.0
quantized[weight_norm < -threshold] = -1.0
# Values between -threshold and threshold remain 0
return quantized
def quantize_activations_8bit(self, x: torch.Tensor) -> torch.Tensor:
"""8-bit activation quantization with numerical stability"""
# Handle empty tensors
if x.numel() == 0:
return x
# Clamp extreme values to prevent overflow
x_clamped = torch.clamp(x, min=-1e6, max=1e6)
# Handle scalar tensors
if x_clamped.numel() == 1:
return x_clamped
# Compute quantization parameters
x_min, x_max = x_clamped.min(), x_clamped.max()
# Prevent division by zero
range_val = x_max - x_min
if range_val < 1e-8:
return x_clamped
scale = range_val / 255.0
self.input_scale.data = scale.clamp(min=1e-8, max=1e3)
# Quantize to 8-bit
zero_point = (-x_min / scale).round().clamp(0, 255)
quantized = ((x_clamped / scale) + zero_point).round().clamp(0, 255)
# Dequantize
dequantized = scale * (quantized - zero_point)
return dequantized
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.training:
# Full precision training with straight-through estimator
# Forward pass with quantized weights but gradients flow through original weights
weight_q = self.quantize_weights_1_58_bit(self.weight)
weight_forward = weight_q * self.weight_scale
# Use original weight for gradient computation
weight_forward = weight_forward + (self.weight - self.weight.detach())
return F.linear(x, weight_forward, self.bias)
else:
# Inference with full quantization
weight_q = self.quantize_weights_1_58_bit(self.weight) * self.weight_scale
x_q = self.quantize_activations_8bit(x)
return F.linear(x_q, weight_q, self.bias)
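# Illustrative sanity check (not part of the original model): verifies that the
# 1.58-bit quantizer above only emits values from {-1, 0, +1}. The layer sizes
# are arbitrary assumptions; call this manually, it never runs at import time.
def _bitnet_linear_quantization_demo() -> None:
    layer = BitNetLinear(in_features=16, out_features=8)
    ternary = layer.quantize_weights_1_58_bit(layer.weight)
    assert set(ternary.unique().tolist()) <= {-1.0, 0.0, 1.0}
    # At inference time the effective weight is the ternary pattern rescaled
    # by the per-tensor scale stored in `weight_scale`
    effective_weight = ternary * layer.weight_scale
    print("ternary values:", ternary.unique().tolist(),
          "scale:", float(layer.weight_scale),
          "effective shape:", tuple(effective_weight.shape))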
class BitNetMLP(nn.Module):
"""BitNet MLP block with 1.58-bit quantization"""
def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.1):
super().__init__()
self.fc1 = BitNetLinear(dim, hidden_dim)
self.fc2 = BitNetLinear(hidden_dim, dim)
self.activation = nn.GELU()
self.dropout = nn.Dropout(dropout)
self.norm = nn.LayerNorm(dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
x = self.fc1(x)
x = self.activation(x)
x = self.dropout(x)
x = self.fc2(x)
x = self.dropout(x)
return self.norm(x + residual)
class BitNetAttention(nn.Module):
"""Multi-head attention with BitNet quantization"""
def __init__(
self,
dim: int,
num_heads: int,
dropout: float = 0.1,
bias: bool = True
):
super().__init__()
assert dim % num_heads == 0
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim ** -0.5
# BitNet quantized projections
self.q_proj = BitNetLinear(dim, dim, bias=bias)
self.k_proj = BitNetLinear(dim, dim, bias=bias)
self.v_proj = BitNetLinear(dim, dim, bias=bias)
self.out_proj = BitNetLinear(dim, dim, bias=bias)
self.dropout = nn.Dropout(dropout)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, seq_len = query.shape[:2]
# Validate input dimensions
if query.size(-1) != self.dim:
raise ValueError(f"Query dimension {query.size(-1)} doesn't match expected {self.dim}")
if key.size(-1) != self.dim:
raise ValueError(f"Key dimension {key.size(-1)} doesn't match expected {self.dim}")
if value.size(-1) != self.dim:
raise ValueError(f"Value dimension {value.size(-1)} doesn't match expected {self.dim}")
# Linear projections
q = self.q_proj(query)
k = self.k_proj(key)
v = self.v_proj(value)
# Get key/value sequence length (handle different shapes)
key_seq_len = key.size(1)
# Reshape for multi-head attention with proper dimension checking
q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
k = k.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
v = v.view(batch_size, key_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
# Attention computation
attention_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
if mask is not None:
# Handle mask shape: expand to match attention scores shape
if mask.dim() == 2: # [batch_size, seq_len]
mask = mask.unsqueeze(1).unsqueeze(1) # [batch_size, 1, 1, seq_len]
elif mask.dim() == 3: # [batch_size, seq_len, seq_len]
mask = mask.unsqueeze(1) # [batch_size, 1, seq_len, seq_len]
# Expand mask to match attention scores shape [batch_size, num_heads, seq_len, key_seq_len]
if mask.size(-1) != key_seq_len:
# Adjust mask if needed
if mask.size(-1) == seq_len:
# Pad or trim mask to match key_seq_len
if key_seq_len > seq_len:
pad_size = key_seq_len - seq_len
mask = torch.cat([mask, torch.zeros(*mask.shape[:-1], pad_size, device=mask.device, dtype=mask.dtype)], dim=-1)
else:
mask = mask[..., :key_seq_len]
mask = mask.expand(batch_size, self.num_heads, seq_len, key_seq_len)
attention_scores.masked_fill_(mask == 0, float('-inf'))
attention_weights = F.softmax(attention_scores, dim=-1)
attention_weights = self.dropout(attention_weights)
# Apply attention to values
attended = torch.matmul(attention_weights, v)
# Reshape and project output
attended = attended.transpose(1, 2).contiguous().view(
batch_size, seq_len, self.dim
)
output = self.out_proj(attended)
return output, attention_weights.mean(dim=1) # Average across heads
class BitNetTransformerBlock(nn.Module):
"""BitNet Transformer block with quantized components"""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
dropout: float = 0.1
):
super().__init__()
self.norm1 = nn.LayerNorm(dim)
self.attn = BitNetAttention(dim, num_heads, dropout)
self.norm2 = nn.LayerNorm(dim)
self.mlp = BitNetMLP(dim, int(dim * mlp_ratio), dropout)
def forward(
self,
x: torch.Tensor,
mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
# Self-attention with residual connection
normed_x = self.norm1(x)
attn_out, attn_weights = self.attn(normed_x, normed_x, normed_x, mask)
x = x + attn_out
# MLP with residual connection
x = x + self.mlp(self.norm2(x))
return x, attn_weights
class BitNetTextEncoder(nn.Module):
"""BitNet-based text encoder"""
def __init__(
self,
vocab_size: int,
dim: int,
num_layers: int,
num_heads: int,
max_seq_len: int = 512,
dropout: float = 0.1
):
super().__init__()
self.dim = dim
self.max_seq_len = max_seq_len
# Token embeddings (kept full precision)
self.token_embedding = nn.Embedding(vocab_size, dim)
self.position_embedding = nn.Embedding(max_seq_len, dim)
# BitNet transformer layers
self.layers = nn.ModuleList([
BitNetTransformerBlock(dim, num_heads, dropout=dropout)
for _ in range(num_layers)
])
self.dropout = nn.Dropout(dropout)
self.norm = nn.LayerNorm(dim)
# Initialize embeddings
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.position_embedding.weight, std=0.02)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
batch_size, seq_len = input_ids.shape
# Embeddings
positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
x = self.token_embedding(input_ids) + \
self.position_embedding(positions)
x = self.dropout(x)
# Transform through BitNet layers
attention_patterns = []
for layer in self.layers:
# Convert attention mask to the right format for the layer
layer_mask = None
if attention_mask is not None:
# Create a mask where 1 means attend, 0 means don't attend
                layer_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
x, attn_weights = layer(x, layer_mask)
attention_patterns.append(attn_weights)
x = self.norm(x)
return x, attention_patterns
class BitNetTextDecoder(nn.Module):
"""BitNet-based text decoder with causal masking"""
def __init__(
self,
vocab_size: int,
dim: int,
num_layers: int,
num_heads: int,
max_seq_len: int = 512,
dropout: float = 0.1
):
super().__init__()
self.dim = dim
self.max_seq_len = max_seq_len
# Token embeddings
self.token_embedding = nn.Embedding(vocab_size, dim)
self.position_embedding = nn.Embedding(max_seq_len, dim)
# BitNet transformer layers
self.layers = nn.ModuleList([
BitNetTransformerBlock(dim, num_heads, dropout=dropout)
for _ in range(num_layers)
])
self.dropout = nn.Dropout(dropout)
self.norm = nn.LayerNorm(dim)
# Output projection to vocabulary
self.lm_head = BitNetLinear(dim, vocab_size, bias=False)
# Initialize embeddings
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.position_embedding.weight, std=0.02)
# Register causal mask
self.register_buffer(
'causal_mask',
torch.tril(torch.ones(max_seq_len, max_seq_len)
).unsqueeze(0).unsqueeze(0)
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None
) -> Dict[str, torch.Tensor]:
if input_ids is not None:
batch_size, seq_len = input_ids.shape
positions = torch.arange(
seq_len, device=input_ids.device).unsqueeze(0)
x = self.token_embedding(input_ids) + \
self.position_embedding(positions)
elif inputs_embeds is not None:
batch_size, seq_len = inputs_embeds.shape[:2]
positions = torch.arange(
seq_len, device=inputs_embeds.device).unsqueeze(0)
x = inputs_embeds + self.position_embedding(positions)
else:
raise ValueError(
"Either input_ids or inputs_embeds must be provided")
x = self.dropout(x)
# Create causal mask
causal_mask = self.causal_mask[:, :, :seq_len, :seq_len]
if attention_mask is not None:
# Combine causal mask with padding mask
mask = attention_mask.unsqueeze(1).unsqueeze(2) * causal_mask
else:
mask = causal_mask
# Transform through BitNet layers
attention_patterns = []
for layer in self.layers:
x, attn_weights = layer(x, mask)
attention_patterns.append(attn_weights)
x = self.norm(x)
logits = self.lm_head(x)
loss = None
if labels is not None:
# Shift labels for causal LM
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = F.cross_entropy(
shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1),
ignore_index=-100
)
return {
'logits': logits,
'loss': loss,
'attention_patterns': attention_patterns
}
class EpisodicMemory(nn.Module):
"""Episodic Memory mechanism inspired by Larimar with performance optimizations and external storage support"""
def __init__(
self,
memory_size: int,
episode_dim: int,
alpha: float = 0.1,
direct_writing: bool = True,
observation_noise_std: float = 1e-6,
external_storage: bool = False,
memory_storage_path: str = None,
compression_enabled: bool = True,
lazy_loading: bool = False
):
super().__init__()
self.memory_size = memory_size
self.episode_dim = episode_dim
self.alpha = alpha
self.direct_writing = direct_writing
self.observation_noise_std = observation_noise_std
# External storage configuration
self.external_storage = external_storage
self.memory_storage_path = memory_storage_path
self.compression_enabled = compression_enabled
self.lazy_loading = lazy_loading
self._memory_loaded = False
self._memory_version = 1
# Memory storage with improved initialization
if external_storage and lazy_loading:
# For lazy loading, we'll initialize empty and load when needed
self._memory_data = None
self._metadata = None
else:
# Standard initialization for compatibility
self.register_buffer('memory', torch.randn(memory_size, episode_dim) * 0.02)
self.register_buffer('memory_age', torch.zeros(memory_size))
self.register_buffer('memory_usage', torch.zeros(memory_size))
# Always initialize these for proper functioning
self.register_buffer('memory_quality', torch.zeros(memory_size))
self.register_buffer('memory_importance', torch.ones(memory_size))
self.register_buffer('memory_mean', torch.zeros(episode_dim))
self.register_buffer('memory_std', torch.ones(episode_dim))
self.register_buffer('update_count', torch.tensor(0))
# Enhanced memory access networks with residual connections
self.query_net = nn.Sequential(
BitNetLinear(episode_dim, episode_dim),
nn.LayerNorm(episode_dim),
nn.GELU(),
BitNetLinear(episode_dim, episode_dim)
)
self.key_net = nn.Sequential(
BitNetLinear(episode_dim, episode_dim),
nn.LayerNorm(episode_dim),
nn.GELU(),
BitNetLinear(episode_dim, episode_dim)
)
self.value_net = nn.Sequential(
BitNetLinear(episode_dim, episode_dim),
nn.LayerNorm(episode_dim),
nn.GELU(),
BitNetLinear(episode_dim, episode_dim)
)
# Add temperature parameter for attention sharpening
self.register_parameter('attention_temperature', nn.Parameter(torch.tensor(1.0)))
# Memory consolidation network for better episode encoding
self.consolidation_net = nn.Sequential(
BitNetLinear(episode_dim, episode_dim * 2),
nn.LayerNorm(episode_dim * 2),
nn.GELU(),
nn.Dropout(0.1),
BitNetLinear(episode_dim * 2, episode_dim),
nn.LayerNorm(episode_dim)
)
def _ensure_memory_loaded(self):
"""Ensure memory is loaded into device memory"""
if self.external_storage and self.lazy_loading and not self._memory_loaded:
self.load_external_memory()
elif not hasattr(self, 'memory'):
# Initialize if not present (compatibility mode)
self.register_buffer('memory', torch.randn(self.memory_size, self.episode_dim) * 0.02)
self.register_buffer('memory_age', torch.zeros(self.memory_size))
self.register_buffer('memory_usage', torch.zeros(self.memory_size))
def save_external_memory(self, path: str = None, compress: bool = None) -> str:
"""Save episodic memory to external storage"""
import os
import json
from pathlib import Path
# Use provided path or default
save_path = path or self.memory_storage_path or "episodic_memory.pt"
save_path = Path(save_path)
save_path.parent.mkdir(parents=True, exist_ok=True)
# Use provided compression setting or default
use_compression = compress if compress is not None else self.compression_enabled
# Prepare memory data
memory_data = {
'memory': self.memory.cpu() if hasattr(self, 'memory') else torch.randn(self.memory_size, self.episode_dim) * 0.02,
'memory_age': self.memory_age.cpu() if hasattr(self, 'memory_age') else torch.zeros(self.memory_size),
'memory_usage': self.memory_usage.cpu() if hasattr(self, 'memory_usage') else torch.zeros(self.memory_size),
'memory_quality': self.memory_quality.cpu(),
'memory_importance': self.memory_importance.cpu(),
'memory_mean': self.memory_mean.cpu(),
'memory_std': self.memory_std.cpu(),
'update_count': self.update_count.cpu(),
'version': self._memory_version,
'metadata': {
'memory_size': self.memory_size,
'episode_dim': self.episode_dim,
'alpha': self.alpha,
'creation_timestamp': torch.tensor(time.time()),
'compression_enabled': use_compression
}
}
# Apply compression if enabled
if use_compression:
# Quantize memory to reduce storage size
memory_data['memory'] = self._compress_memory_tensor(memory_data['memory'])
memory_data['compressed'] = True
else:
memory_data['compressed'] = False
# Save to file
torch.save(memory_data, save_path)
# Also save metadata separately for quick access
metadata_path = save_path.with_suffix('.json')
with open(metadata_path, 'w') as f:
json.dump({
'memory_size': self.memory_size,
'episode_dim': self.episode_dim,
'version': self._memory_version,
'compressed': use_compression,
'file_size_mb': save_path.stat().st_size / (1024 * 1024),
'creation_timestamp': time.time()
}, f, indent=2)
logger.info(f"💾 Episodic memory saved to: {save_path}")
logger.info(f"📊 Memory size: {save_path.stat().st_size / 1024:.1f} KB")
return str(save_path)
def load_external_memory(self, path: str = None, device: str = None) -> bool:
"""Load episodic memory from external storage"""
import json
from pathlib import Path
# Use provided path or default
load_path = path or self.memory_storage_path or "episodic_memory.pt"
load_path = Path(load_path)
if not load_path.exists():
logger.warning(f"⚠️ External memory file not found: {load_path}")
return False
try:
# Load memory data
memory_data = torch.load(load_path, map_location='cpu')
# Validate compatibility
if memory_data['metadata']['memory_size'] != self.memory_size:
logger.error(f"❌ Memory size mismatch: expected {self.memory_size}, got {memory_data['metadata']['memory_size']}")
return False
if memory_data['metadata']['episode_dim'] != self.episode_dim:
logger.error(f"❌ Episode dimension mismatch: expected {self.episode_dim}, got {memory_data['metadata']['episode_dim']}")
return False
# Set device
device = device or next(self.parameters()).device
# Decompress if needed
if memory_data.get('compressed', False):
memory_tensor = self._decompress_memory_tensor(memory_data['memory'])
else:
memory_tensor = memory_data['memory']
# Load memory tensors
if hasattr(self, 'memory'):
self.memory.copy_(memory_tensor.to(device))
self.memory_age.copy_(memory_data['memory_age'].to(device))
self.memory_usage.copy_(memory_data['memory_usage'].to(device))
else:
# Register buffers if not present (lazy loading case)
self.register_buffer('memory', memory_tensor.to(device))
self.register_buffer('memory_age', memory_data['memory_age'].to(device))
self.register_buffer('memory_usage', memory_data['memory_usage'].to(device))
self.memory_quality.copy_(memory_data['memory_quality'].to(device))
self.memory_importance.copy_(memory_data['memory_importance'].to(device))
self.memory_mean.copy_(memory_data['memory_mean'].to(device))
self.memory_std.copy_(memory_data['memory_std'].to(device))
self.update_count.copy_(memory_data['update_count'].to(device))
self._memory_version = memory_data.get('version', 1)
self._memory_loaded = True
logger.info(f"✅ Episodic memory loaded from: {load_path}")
logger.info(f"📊 Memory version: {self._memory_version}")
return True
except Exception as e:
logger.error(f"❌ Failed to load external memory: {e}")
return False
    def _compress_memory_tensor(self, tensor: torch.Tensor) -> Union[torch.Tensor, Dict]:
"""Compress memory tensor for storage"""
# Quantize to int8 to reduce storage size
tensor_min = tensor.min()
tensor_max = tensor.max()
# Avoid division by zero
tensor_range = tensor_max - tensor_min
if tensor_range < 1e-8:
return tensor
# Quantize to int8 range
quantized = ((tensor - tensor_min) / tensor_range * 255).round().clamp(0, 255).to(torch.uint8)
# Store quantization parameters
return {
'data': quantized,
'min': tensor_min,
'max': tensor_max,
'original_shape': tensor.shape
}
def _decompress_memory_tensor(self, compressed_data) -> torch.Tensor:
"""Decompress memory tensor"""
if isinstance(compressed_data, dict):
quantized = compressed_data['data'].float()
tensor_min = compressed_data['min']
tensor_max = compressed_data['max']
# Dequantize
tensor_range = tensor_max - tensor_min
dequantized = (quantized / 255.0) * tensor_range + tensor_min
return dequantized.view(compressed_data['original_shape'])
else:
# Not compressed, return as-is
return compressed_data
def _update_memory_statistics(self, episodes: torch.Tensor):
"""Update running statistics for memory normalization"""
with torch.no_grad():
batch_mean = episodes.mean(dim=0)
batch_var = episodes.var(dim=0, unbiased=False)
# Exponential moving average
momentum = 0.1
self.memory_mean = (1 - momentum) * self.memory_mean + momentum * batch_mean
self.memory_std = torch.sqrt((1 - momentum) * self.memory_std**2 + momentum * batch_var)
self.update_count += 1
def _normalize_episodes(self, episodes: torch.Tensor) -> torch.Tensor:
"""Normalize episodes using running statistics"""
if self.update_count > 10: # Only normalize after some updates
return (episodes - self.memory_mean) / (self.memory_std + 1e-8)
return episodes
def _compute_episode_quality(self, episode: torch.Tensor, retrieved: torch.Tensor) -> torch.Tensor:
"""Compute quality score for memory episodes"""
# Quality based on diversity and relevance
similarity_to_memory = torch.cosine_similarity(
episode.unsqueeze(1), self.memory.unsqueeze(0), dim=-1
).max(dim=1)[0]
# Encourage diversity - lower similarity = higher quality
diversity_score = 1.0 - similarity_to_memory
# Relevance score based on retrieval quality
retrieval_quality = torch.cosine_similarity(episode, retrieved, dim=-1)
# Combined quality score
return 0.7 * diversity_score + 0.3 * retrieval_quality
def write_memory(self, episode: torch.Tensor) -> torch.Tensor:
"""Optimized memory writing with intelligent slot selection"""
batch_size = episode.size(0)
# Apply consolidation to improve episode representation
consolidated_episode = self.consolidation_net(episode) + episode # Residual connection
# Update statistics
self._update_memory_statistics(consolidated_episode)
# Normalize episodes
normalized_episode = self._normalize_episodes(consolidated_episode)
if self.direct_writing:
# Enhanced slot selection combining age, usage, and quality
if batch_size <= self.memory_size:
# Compute composite scores for slot selection
age_scores = -self.memory_age # Prefer older slots
usage_scores = -self.memory_usage # Prefer less used slots
quality_scores = -self.memory_quality # Prefer lower quality slots
importance_scores = -self.memory_importance # Prefer less important slots
# Weighted combination
composite_scores = (
0.4 * age_scores +
0.3 * usage_scores +
0.2 * quality_scores +
0.1 * importance_scores
)
_, best_indices = composite_scores.topk(batch_size, largest=True)
# Update memory slots with momentum-based updates
momentum = self.alpha
self.memory[best_indices] = (
(1 - momentum) * self.memory[best_indices] +
momentum * normalized_episode.detach()
)
# Update metadata
self.memory_age[best_indices] = self.memory_age.max() + 1
self.memory_usage[best_indices] += 1
# Update quality scores (will be computed during read)
with torch.no_grad():
# Temporary quality estimation based on internal consistency
temp_quality = torch.norm(normalized_episode, dim=-1)
self.memory_quality[best_indices] = temp_quality.detach()
else:
# Handle large batches efficiently
for i in range(0, batch_size, self.memory_size):
end_idx = min(i + self.memory_size, batch_size)
chunk_size = end_idx - i
# Apply same logic for chunks
age_scores = -self.memory_age
usage_scores = -self.memory_usage
quality_scores = -self.memory_quality
importance_scores = -self.memory_importance
composite_scores = (
0.4 * age_scores +
0.3 * usage_scores +
0.2 * quality_scores +
0.1 * importance_scores
)
_, chunk_indices = composite_scores.topk(chunk_size, largest=True)
momentum = self.alpha
self.memory[chunk_indices] = (
(1 - momentum) * self.memory[chunk_indices] +
momentum * normalized_episode[i:end_idx].detach()
)
self.memory_age[chunk_indices] = self.memory_age.max() + 1 + i
self.memory_usage[chunk_indices] += 1
temp_quality = torch.norm(normalized_episode[i:end_idx], dim=-1)
self.memory_quality[chunk_indices] = temp_quality.detach()
return consolidated_episode
def read_memory(self, query: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Optimized memory reading with enhanced attention"""
batch_size = query.size(0)
# Validate query dimensions
if query.size(-1) != self.episode_dim:
raise ValueError(f"Query dimension {query.size(-1)} doesn't match memory episode_dim {self.episode_dim}")
# Normalize query
normalized_query = self._normalize_episodes(query)
# Enhanced query, key, value computation with residual connections
q = self.query_net(normalized_query) + normalized_query # Residual
k = self.key_net(self.memory) + self.memory # Residual
v = self.value_net(self.memory) + self.memory # Residual
# Scaled dot-product attention with learnable temperature
attention_scores = torch.matmul(q, k.transpose(0, 1)) / (
math.sqrt(self.episode_dim) * self.attention_temperature.clamp(min=0.1, max=10.0)
)
# Add importance weighting to attention scores
importance_weights = self.memory_importance.unsqueeze(0).expand(batch_size, -1)
attention_scores = attention_scores + torch.log(importance_weights + 1e-8)
# Apply attention with improved stability
attention_weights = F.softmax(attention_scores, dim=-1)
# Add attention dropout for regularization during training
if self.training:
attention_weights = F.dropout(attention_weights, p=0.1)
# Weighted memory retrieval
retrieved = torch.matmul(attention_weights, v)
# Update memory access statistics and importance
with torch.no_grad():
access_counts = attention_weights.sum(0)
self.memory_usage += access_counts
# Update importance based on usage frequency
self.memory_importance = 0.9 * self.memory_importance + 0.1 * (access_counts + 1e-8)
# Update quality scores based on retrieval effectiveness
if hasattr(self, '_last_query_quality'):
quality_update = self._compute_episode_quality(query, retrieved)
# Update quality for attended slots
attended_indices = attention_weights.max(0)[1] # Most attended slots
self.memory_quality[attended_indices] = (
0.8 * self.memory_quality[attended_indices] +
0.2 * quality_update.mean()
)
return retrieved, attention_weights
    def forward(self, episode: torch.Tensor, mode: str = "read_write") -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Enhanced forward pass with memory consolidation"""
if mode == "write":
return self.write_memory(episode), None
elif mode == "read":
return self.read_memory(episode)
else: # read_write
# Write episode to memory with consolidation
consolidated_episode = self.write_memory(episode)
# Read from memory using consolidated episode as query
retrieved, attention_weights = self.read_memory(consolidated_episode)
# Memory-augmented output combining input and retrieved memory
output = 0.7 * consolidated_episode + 0.3 * retrieved
return output, attention_weights
def get_memory_statistics(self) -> Dict[str, torch.Tensor]:
"""Get comprehensive memory statistics for monitoring"""
return {
'memory_usage_distribution': self.memory_usage,
'memory_age_distribution': self.memory_age,
'memory_quality_scores': self.memory_quality,
'memory_importance': self.memory_importance,
'attention_temperature': self.attention_temperature,
'memory_utilization': (self.memory_usage > 0).float().mean(),
'memory_diversity': torch.std(self.memory, dim=0).mean(),
'update_count': self.update_count
}
def consolidate_memory(self):
"""Explicit memory consolidation for improved organization"""
with torch.no_grad():
# Sort memory by importance and quality
importance_quality_score = 0.6 * self.memory_importance + 0.4 * self.memory_quality
sorted_indices = torch.argsort(importance_quality_score, descending=True)
# Reorganize memory to group similar episodes
sorted_memory = self.memory[sorted_indices]
self.memory.copy_(sorted_memory)
# Update corresponding metadata
self.memory_age[:] = self.memory_age[sorted_indices]
self.memory_usage[:] = self.memory_usage[sorted_indices]
self.memory_quality[:] = self.memory_quality[sorted_indices]
self.memory_importance[:] = self.memory_importance[sorted_indices]
def get_memory_info(self) -> Dict:
"""Get comprehensive memory information"""
info = {
'memory_size': self.memory_size,
'episode_dim': self.episode_dim,
'external_storage': self.external_storage,
'compression_enabled': self.compression_enabled,
'lazy_loading': self.lazy_loading,
'memory_loaded': self._memory_loaded if self.external_storage else True,
'version': self._memory_version,
'storage_path': self.memory_storage_path
}
if hasattr(self, 'memory'):
info.update({
'memory_utilization': (self.memory_usage > 0).float().mean().item(),
'memory_diversity': torch.std(self.memory, dim=0).mean().item(),
'update_count': self.update_count.item(),
'memory_device': str(self.memory.device)
})
return info
def create_memory_snapshot(self, snapshot_name: str = None) -> str:
"""Create a named snapshot of the current memory state"""
import time
from pathlib import Path
timestamp = int(time.time())
snapshot_name = snapshot_name or f"memory_snapshot_{timestamp}"
# Create snapshots directory
snapshots_dir = Path("memory_snapshots")
snapshots_dir.mkdir(exist_ok=True)
snapshot_path = snapshots_dir / f"{snapshot_name}.pt"
# Save current memory state
saved_path = self.save_external_memory(str(snapshot_path), compress=True)
logger.info(f"📸 Memory snapshot created: {saved_path}")
return saved_path
def load_memory_snapshot(self, snapshot_name: str) -> bool:
"""Load a named memory snapshot"""
from pathlib import Path
snapshots_dir = Path("memory_snapshots")
snapshot_path = snapshots_dir / f"{snapshot_name}.pt"
if not snapshot_path.exists():
logger.warning(f"⚠️ Snapshot not found: {snapshot_path}")
return False
success = self.load_external_memory(str(snapshot_path))
if success:
logger.info(f"📸 Memory snapshot loaded: {snapshot_name}")
return success
def enable_external_storage(self, storage_path: str = None, compress: bool = True, lazy: bool = False):
"""Enable external storage mode for edge deployment"""
self.external_storage = True
self.memory_storage_path = storage_path or "episodic_memory.pt"
self.compression_enabled = compress
self.lazy_loading = lazy
logger.info(f"🔄 External storage enabled: {self.memory_storage_path}")
logger.info(f" Compression: {compress}, Lazy loading: {lazy}")
def disable_external_storage(self):
"""Disable external storage and return to integrated mode"""
# Ensure memory is loaded before disabling external storage
self._ensure_memory_loaded()
self.external_storage = False
self.lazy_loading = False
self._memory_loaded = True
logger.info("🔄 External storage disabled, using integrated mode")
# ...existing code for other methods...
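# Illustrative round-trip of the external-storage helpers above (not part of the
# original model). The path, memory size, and episode dimension are assumptions;
# call this manually, it never runs at import time and it writes files to disk.
def _episodic_memory_storage_demo(path: str = "demo_episodic_memory.pt") -> None:
    memory = EpisodicMemory(memory_size=32, episode_dim=128)
    memory.write_memory(torch.randn(4, 128))        # populate a few slots
    saved_path = memory.save_external_memory(path)  # compressed by default
    restored = EpisodicMemory(memory_size=32, episode_dim=128)
    assert restored.load_external_memory(saved_path)  # sizes must match
    print(restored.get_memory_info())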
class CrossModalFusion(nn.Module):
"""Cross-modal fusion module for text and vision features"""
def __init__(
self,
text_dim: int,
vision_dim: int,
hidden_dim: int,
num_heads: int = 8,
num_layers: int = 2
):
super().__init__()
self.text_dim = text_dim
self.vision_dim = vision_dim
self.hidden_dim = hidden_dim
# Projection layers
self.text_proj = BitNetLinear(text_dim, hidden_dim)
self.vision_proj = BitNetLinear(vision_dim, hidden_dim)
# Cross-attention layers
self.cross_attention_layers = nn.ModuleList([
BitNetAttention(
dim=hidden_dim,
num_heads=num_heads
) for _ in range(num_layers)
])
# Layer normalization
self.layer_norms = nn.ModuleList([
nn.LayerNorm(hidden_dim) for _ in range(num_layers)
])
# Output projection
self.output_proj = BitNetLinear(hidden_dim, hidden_dim)
def forward(
self,
text_features: torch.Tensor,
vision_features: torch.Tensor
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
"""
Args:
text_features: [batch_size, seq_len, text_dim]
vision_features: [batch_size, vision_dim]
Returns:
fused_features: [batch_size, seq_len, hidden_dim]
attention_weights: Dict of attention patterns
"""
batch_size, seq_len = text_features.shape[:2]
# Validate input dimensions
if text_features.size(-1) != self.text_dim:
raise ValueError(f"Text features dimension {text_features.size(-1)} doesn't match expected {self.text_dim}")
if vision_features.size(-1) != self.vision_dim:
raise ValueError(f"Vision features dimension {vision_features.size(-1)} doesn't match expected {self.vision_dim}")
# Project to common dimension
# [batch_size, seq_len, hidden_dim]
text_proj = self.text_proj(text_features)
vision_proj = self.vision_proj(vision_features).unsqueeze(1) # [batch_size, 1, hidden_dim]
# Cross-attention fusion
fused = text_proj
attention_weights = {}
for i, (attn_layer, norm_layer) in enumerate(zip(self.cross_attention_layers, self.layer_norms)):
# Text-to-vision cross-attention
attn_output, attn_weights = attn_layer(
query=fused,
key=vision_proj,
value=vision_proj
)
# Residual connection and normalization
fused = norm_layer(fused + attn_output)
attention_weights[f'layer_{i}'] = attn_weights
# Output projection
output = self.output_proj(fused)
return output, attention_weights
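# Illustrative shape check for the fusion module above (not part of the original
# model). Dimensions mirror the BitMarConfig defaults but are assumptions here;
# call this manually, it never runs at import time.
def _cross_modal_fusion_shape_demo() -> None:
    fusion = CrossModalFusion(text_dim=128, vision_dim=128, hidden_dim=128,
                              num_heads=4, num_layers=2)
    text_features = torch.randn(2, 16, 128)   # [batch_size, seq_len, text_dim]
    vision_features = torch.randn(2, 128)     # [batch_size, vision_dim]
    fused, attention = fusion(text_features, vision_features)
    assert fused.shape == (2, 16, 128)        # [batch_size, seq_len, hidden_dim]
    print("fusion attention layers:", list(attention.keys()))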
class VisionEncoder(nn.Module):
"""Quantized Vision Encoder for DiNOv2 features"""
def __init__(
self,
input_dim: int = 768,
hidden_dim: int = 512,
output_dim: int = 768,
num_layers: int = 2
):
super().__init__()
# Quantized layers
self.layers = nn.ModuleList([
BitNetLinear(input_dim if i == 0 else hidden_dim, hidden_dim)
for i in range(num_layers)
])
# Output projection
self.output_proj = BitNetLinear(hidden_dim, output_dim)
# Activation and normalization
self.activation = nn.GELU()
self.layer_norms = nn.ModuleList([
nn.LayerNorm(hidden_dim) for _ in range(num_layers)
])
self.dropout = nn.Dropout(0.1)
def forward(self, vision_features: torch.Tensor) -> torch.Tensor:
"""
Args:
vision_features: [batch_size, input_dim] - DiNOv2 features
Returns:
encoded_features: [batch_size, output_dim]
"""
# Handle potential extra dimensions
if vision_features.dim() > 2:
# Flatten any extra dimensions except batch
original_shape = vision_features.shape
vision_features = vision_features.view(original_shape[0], -1)
# Ensure we have the expected input dimension
if vision_features.size(-1) != self.layers[0].in_features:
# Take only the first input_dim features if we have more
if vision_features.size(-1) > self.layers[0].in_features:
vision_features = vision_features[:, :self.layers[0].in_features]
else:
raise ValueError(f"Vision features dimension {vision_features.size(-1)} is smaller than expected {self.layers[0].in_features}")
x = vision_features
for layer, norm in zip(self.layers, self.layer_norms):
x = layer(x)
x = norm(x)
x = self.activation(x)
x = self.dropout(x)
# Output projection
output = self.output_proj(x)
return output
class BitMarModel(PreTrainedModel):
"""
BitMar: BitNet-quantized Vision-Language Episodic Memory Transformer
Compatible with Hugging Face Transformers
"""
config_class = BitMarConfig
base_model_prefix = "bitmar"
supports_gradient_checkpointing = True
_no_split_modules = ["BitNetTransformerBlock", "EpisodicMemory"]
def __init__(self, config: BitMarConfig):
super().__init__(config)
self.config = config
# Loss balancing parameters
self.cross_modal_loss_weight = getattr(config, 'cross_modal_loss_weight', 0.1)
self.text_loss_weight = getattr(config, 'text_loss_weight', 1.0)
self.vision_loss_weight = getattr(config, 'vision_loss_weight', 0.1)
self.memory_loss_weight = getattr(config, 'memory_loss_weight', 0.05)
# Dynamic loss scaling
self.adaptive_loss_scaling = getattr(config, 'adaptive_loss_scaling', True)
self.loss_scale_temperature = getattr(config, 'loss_scale_temperature', 0.07)
# Encoder freezing parameters
self.freeze_text_encoder_steps = getattr(config, 'freeze_text_encoder_steps', 0)
self.freeze_vision_encoder_steps = getattr(config, 'freeze_vision_encoder_steps', 0)
self.current_step = 0
# BitNet text encoder/decoder
self.text_encoder = BitNetTextEncoder(
vocab_size=config.vocab_size,
dim=config.text_encoder_dim,
num_layers=config.text_encoder_layers,
num_heads=config.text_encoder_heads,
max_seq_len=config.max_seq_len,
dropout=config.dropout
)
self.text_decoder = BitNetTextDecoder(
vocab_size=config.vocab_size,
dim=config.text_decoder_dim,
num_layers=config.text_decoder_layers,
num_heads=config.text_decoder_heads,
max_seq_len=config.max_seq_len,
dropout=config.dropout
)
# Vision processing with BitNet quantization
self.vision_encoder = VisionEncoder(
input_dim=config.vision_encoder_dim,
hidden_dim=config.vision_hidden_size,
output_dim=config.vision_latent_size
)
# Cross-modal fusion with BitNet
self.fusion = CrossModalFusion(
text_dim=config.text_encoder_dim,
vision_dim=config.vision_latent_size,
hidden_dim=config.fusion_hidden_size,
num_heads=config.fusion_num_heads,
num_layers=config.fusion_num_layers
)
# Episodic memory with BitNet quantization
self.memory = EpisodicMemory(
memory_size=config.memory_size,
episode_dim=config.episode_dim,
alpha=config.memory_alpha,
direct_writing=config.direct_writing
)
# Additional BitNet projection layers
self.text_to_episode = BitNetLinear(
config.text_encoder_dim,
config.episode_dim
)
self.vision_to_episode = BitNetLinear(
config.vision_latent_size,
config.episode_dim
)
self.memory_to_decoder = BitNetLinear(
config.episode_dim,
config.fusion_hidden_size
)
# Projection to decoder dimension
self.decoder_input_proj = BitNetLinear(
config.fusion_hidden_size,
config.text_decoder_dim
)
# Initialize tokenizer (for compatibility)
try:
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
        except Exception:
            # Tokenizer is optional; fall back gracefully if it cannot be loaded
            self.tokenizer = None
self.post_init()
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, BitNetLinear)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if hasattr(module, 'bias') and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
if hasattr(module, 'bias') and module.bias is not None:
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
"""Encode text using BitNet encoder"""
text_features, attention_patterns = self.text_encoder(
input_ids=input_ids, attention_mask=attention_mask)
return text_features, attention_patterns
def encode_vision(self, vision_features: torch.Tensor) -> torch.Tensor:
"""Encode vision features using quantized vision encoder"""
vision_latent = self.vision_encoder(vision_features)
return vision_latent
def create_episode(
self,
text_features: torch.Tensor,
vision_latent: torch.Tensor,
attention_weights: Dict[str, torch.Tensor]
) -> torch.Tensor:
"""Create multimodal episode for memory storage"""
# Pool text features (mean pooling)
text_pooled = text_features.mean(dim=1)
# Project both text and vision to episode dimension
text_projected = self.text_to_episode(text_pooled)
vision_projected = self.vision_to_episode(vision_latent)
# Combine text and vision features
episode = text_projected + vision_projected
return episode
def create_episode_mixed(
self,
text_features: torch.Tensor,
vision_latent: torch.Tensor,
attention_weights: Dict[str, torch.Tensor],
has_vision: torch.Tensor
) -> torch.Tensor:
"""Create episodes with different handling for vision vs text-only samples"""
batch_size = text_features.size(0)
# Pool text features
text_pooled = text_features.mean(dim=1)
# Project to episode dimension
text_episode = self.text_to_episode(text_pooled)
vision_episode = self.vision_to_episode(vision_latent)
# For text-only samples, use only text features
# For multimodal samples, combine text and vision
episode = torch.zeros_like(text_episode)
# Text-only samples (has_vision == False)
text_only_mask = ~has_vision
if text_only_mask.any():
episode[text_only_mask] = text_episode[text_only_mask]
# Multimodal samples (has_vision == True)
multimodal_mask = has_vision
if multimodal_mask.any():
# Combine text and vision for multimodal samples
combined = text_episode[multimodal_mask] + vision_episode[multimodal_mask]
episode[multimodal_mask] = combined
return episode
def compute_cross_modal_contrastive_loss(
self,
text_features: torch.Tensor,
vision_features: torch.Tensor,
temperature: float = 0.07
) -> torch.Tensor:
"""Compute cross-modal contrastive loss similar to CLIP"""
batch_size = text_features.shape[0]
# Handle dimension mismatch between text and vision features
text_dim = text_features.shape[-1]
vision_dim = vision_features.shape[-1]
        if text_dim != vision_dim:
            # Align by truncating the larger feature to the smaller dimension
            # (no learned projection is applied here)
            target_dim = min(text_dim, vision_dim)
            if text_dim > vision_dim:
                # Truncate text features to the vision dimension
                text_features = text_features[:, :target_dim]
            else:
                # Truncate vision features to the text dimension
                vision_features = vision_features[:, :target_dim]
# Normalize features
text_features = F.normalize(text_features, dim=-1)
vision_features = F.normalize(vision_features, dim=-1)
# Compute similarity matrix
logits = torch.matmul(text_features, vision_features.T) / temperature
# Create labels (diagonal should be positive pairs)
labels = torch.arange(batch_size, device=logits.device)
# Compute cross-entropy loss for both directions
text_to_vision_loss = F.cross_entropy(logits, labels)
vision_to_text_loss = F.cross_entropy(logits.T, labels)
return (text_to_vision_loss + vision_to_text_loss) / 2
def compute_vision_reconstruction_loss(
self,
original_vision: torch.Tensor,
reconstructed_vision: torch.Tensor
) -> torch.Tensor:
"""Compute vision reconstruction loss to prevent vision encoder collapse"""
return F.mse_loss(reconstructed_vision, original_vision)
def compute_memory_consistency_loss(
self,
episode: torch.Tensor,
retrieved_memory: torch.Tensor
) -> torch.Tensor:
"""Compute memory consistency loss to encourage meaningful memory usage"""
# L2 regularization on memory difference
memory_diff = episode - retrieved_memory
return torch.mean(torch.norm(memory_diff, dim=-1))
def compute_balanced_loss(
self,
decoder_loss: torch.Tensor,
cross_modal_loss: torch.Tensor,
vision_loss: Optional[torch.Tensor] = None,
memory_loss: Optional[torch.Tensor] = None,
step: int = 0,
adaptive_controller=None
) -> Dict[str, torch.Tensor]:
"""Compute balanced multi-objective loss with adaptive scaling"""
losses = {'decoder_loss': decoder_loss, 'cross_modal_loss': cross_modal_loss}
if vision_loss is not None:
losses['vision_loss'] = vision_loss
if memory_loss is not None:
losses['memory_loss'] = memory_loss
if self.adaptive_loss_scaling:
# Adaptive scaling based on loss magnitudes
with torch.no_grad():
# Compute relative loss scales
decoder_scale = decoder_loss.detach()
cross_modal_scale = cross_modal_loss.detach()
# Prevent division by zero
if decoder_scale > 1e-8:
adaptive_cross_modal_weight = (decoder_scale / cross_modal_scale.clamp(min=1e-8)) * self.cross_modal_loss_weight
else:
adaptive_cross_modal_weight = self.cross_modal_loss_weight
# Clamp adaptive weights
adaptive_cross_modal_weight = torch.clamp(adaptive_cross_modal_weight, 0.01, 1.0)
else:
adaptive_cross_modal_weight = self.cross_modal_loss_weight
# Apply loss scheduling (increase cross-modal importance over time)
cross_modal_schedule = min(1.0, step / 50000) # Ramp up over 50k steps
scheduled_cross_modal_weight = adaptive_cross_modal_weight * cross_modal_schedule
# Compute weighted total loss
total_loss = (
self.text_loss_weight * decoder_loss +
scheduled_cross_modal_weight * cross_modal_loss
)
if vision_loss is not None:
total_loss += self.vision_loss_weight * vision_loss
if memory_loss is not None:
total_loss += self.memory_loss_weight * memory_loss
losses.update({
'total_loss': total_loss,
'cross_modal_weight': scheduled_cross_modal_weight,
'adaptive_weight': adaptive_cross_modal_weight if self.adaptive_loss_scaling else torch.tensor(0.0)
})
return losses
def apply_encoder_freezing(self, step: int):
"""Apply temporary encoder freezing based on training step"""
self.current_step = step
# Freeze text encoder if within freezing window
freeze_text = step < self.freeze_text_encoder_steps
for param in self.text_encoder.parameters():
param.requires_grad = not freeze_text
# Freeze vision encoder if within freezing window
freeze_vision = step < self.freeze_vision_encoder_steps
for param in self.vision_encoder.parameters():
param.requires_grad = not freeze_vision
return {
'text_encoder_frozen': freeze_text,
'vision_encoder_frozen': freeze_vision
}
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
vision_features: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
mode: str = "train",
step: int = 0,
has_vision: Optional[torch.Tensor] = None,
**kwargs
) -> Union[Tuple, CausalLMOutput]:
"""
Forward pass through BitMar model with mixed vision/text batch support
Args:
has_vision: Boolean tensor [batch_size] indicating which samples have real vision features
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if input_ids is None:
            raise ValueError("input_ids must be provided")
        # Ensure input_ids are integer token indices
        if input_ids.dtype != torch.long:
            input_ids = input_ids.long()
        # Ensure labels are integer token indices if provided
        if labels is not None and labels.dtype != torch.long:
            labels = labels.long()
batch_size, seq_len = input_ids.shape
# Handle missing attention mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.float)
# Ensure attention_mask is float
if attention_mask.dtype != torch.float:
attention_mask = attention_mask.float()
# Handle missing vision features
if vision_features is None:
vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
device=input_ids.device, dtype=torch.float32)
# Validate input tensor dimensions
expected_vision_dim = self.config.vision_encoder_dim
if vision_features.dim() != 2 or vision_features.size(-1) != expected_vision_dim:
if vision_features.dim() > 2:
vision_features = vision_features.view(batch_size, -1)
if vision_features.size(-1) != expected_vision_dim:
# Pad or trim to expected dimension
if vision_features.size(-1) > expected_vision_dim:
vision_features = vision_features[:, :expected_vision_dim]
else:
padding = expected_vision_dim - vision_features.size(-1)
vision_features = F.pad(vision_features, (0, padding))
# Default has_vision to all True if not provided (backward compatibility)
if has_vision is None:
has_vision = torch.ones(batch_size, dtype=torch.bool, device=input_ids.device)
# Apply encoder freezing
freezing_status = {}
if mode == "train":
freezing_status = self.apply_encoder_freezing(step)
# Encode text (always available)
text_features, text_attention = self.encode_text(input_ids, attention_mask)
# Encode vision (with masking for text-only samples)
vision_latent = self.encode_vision(vision_features)
# Mask vision features for text-only samples
vision_mask = has_vision.float().unsqueeze(-1)
vision_latent_masked = vision_latent * vision_mask
# Cross-modal fusion
fused_features, cross_attention = self.fusion(text_features, vision_latent_masked)
# Create episodes
if has_vision.any() and (~has_vision).any():
# Mixed batch - use mixed episode creation
episode = self.create_episode_mixed(
text_features, vision_latent_masked, cross_attention, has_vision
)
else:
# Uniform batch - use standard episode creation
episode = self.create_episode(
text_features, vision_latent_masked, cross_attention
)
# Episodic memory interaction
if mode == "train":
retrieved_memory, memory_attention = self.memory(episode, mode="read_write")
else:
retrieved_memory, memory_attention = self.memory(episode, mode="read")
# Prepare decoder input
memory_context = self.memory_to_decoder(retrieved_memory)
memory_context_expanded = memory_context.unsqueeze(1).expand(-1, seq_len, -1)
fused_with_memory = fused_features + memory_context_expanded
decoder_input = self.decoder_input_proj(fused_with_memory)
# Generate text using BitNet decoder
decoder_outputs = self.text_decoder(
inputs_embeds=decoder_input,
attention_mask=attention_mask,
labels=labels
)
# Compute losses if in training mode
final_loss = None
loss_dict = {}
if mode == "train" and labels is not None:
# Primary decoder loss
decoder_loss = decoder_outputs['loss']
# Cross-modal contrastive loss (only for samples with vision)
cross_modal_loss = torch.tensor(0.0, device=input_ids.device)
if has_vision.any():
vision_indices = has_vision.nonzero(as_tuple=True)[0]
if len(vision_indices) > 0:
text_pooled = text_features[vision_indices].mean(dim=1)
vision_for_loss = vision_latent[vision_indices]
cross_modal_loss = self.compute_cross_modal_contrastive_loss(
text_pooled, vision_for_loss, temperature=self.loss_scale_temperature
)
# Optional additional losses
vision_loss = None
memory_loss = self.compute_memory_consistency_loss(episode, retrieved_memory)
# Compute balanced loss
loss_dict = self.compute_balanced_loss(
decoder_loss, cross_modal_loss, vision_loss, memory_loss, step
)
final_loss = loss_dict['total_loss']
elif decoder_outputs.get('loss') is not None:
final_loss = decoder_outputs['loss']
# Prepare outputs
if return_dict:
output = CausalLMOutput(
loss=final_loss,
logits=decoder_outputs['logits'],
hidden_states=fused_features if output_hidden_states else None,
attentions=text_attention if output_attentions else None,
)
# Add additional outputs for analysis
if mode == "train":
for key, value in loss_dict.items():
setattr(output, key, value)
for key, value in freezing_status.items():
setattr(output, key, value)
return output
else:
outputs = (decoder_outputs['logits'],)
if final_loss is not None:
outputs = (final_loss,) + outputs
if output_hidden_states:
outputs = outputs + (fused_features,)
if output_attentions:
outputs = outputs + (text_attention,)
return outputs
def generate(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
vision_features: Optional[torch.Tensor] = None,
max_length: int = 100,
temperature: float = 0.7,
top_p: float = 0.9,
do_sample: bool = True,
**kwargs
) -> torch.LongTensor:
"""Generate text given input text and vision features"""
self.eval()
batch_size = input_ids.size(0)
device = input_ids.device
# Handle missing vision features
if vision_features is None:
vision_features = torch.zeros(batch_size, self.config.vision_encoder_dim,
device=device, dtype=torch.float32)
# Handle attention mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
generated_ids = input_ids.clone()
current_attention_mask = attention_mask.clone()
with torch.no_grad():
for _ in range(max_length - input_ids.size(1)):
# Get model outputs
outputs = self.forward(
input_ids=generated_ids,
attention_mask=current_attention_mask,
vision_features=vision_features,
mode="inference",
return_dict=True
)
# Get next token logits
next_token_logits = outputs.logits[:, -1, :] / temperature
if do_sample:
# Apply top-p sampling
if top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
next_token_logits[indices_to_remove] = float('-inf')
# Sample from the filtered distribution
probs = F.softmax(next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
else:
# Greedy decoding
next_token = next_token_logits.argmax(dim=-1, keepdim=True)
# Append to generated sequence
generated_ids = torch.cat([generated_ids, next_token], dim=-1)
# Update attention mask
                current_attention_mask = torch.cat([
                    current_attention_mask,
                    torch.ones(batch_size, 1, device=device, dtype=current_attention_mask.dtype)
                ], dim=-1)
# Stop if EOS token is generated
if (next_token == self.config.eos_token_id).all():
break
return generated_ids
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
vision_features=None,
**kwargs
):
"""Prepare inputs for generation"""
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"vision_features": vision_features,
"use_cache": kwargs.get("use_cache", True),
}
# Register the model with transformers
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
AutoConfig.register("bitmar", BitMarConfig)
AutoModel.register(BitMarConfig, BitMarModel)
AutoModelForCausalLM.register(BitMarConfig, BitMarModel)
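if __name__ == "__main__":
    # Minimal smoke test (illustrative only): builds the model with the default
    # configuration and runs a single forward pass on random inputs. Shapes are
    # assumptions, not a recommended training setup.
    config = BitMarConfig()
    model = BitMarModel(config)
    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    attention_mask = torch.ones(2, 16)
    vision_features = torch.randn(2, config.vision_encoder_dim)
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        vision_features=vision_features,
        labels=input_ids.clone(),
        mode="train",
        return_dict=True,
    )
    print("loss:", float(outputs.loss), "logits shape:", tuple(outputs.logits.shape))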