Initial upload: B2NL-IntelligentTokenizer v6.2.1 (Autoregressive Mode)

ffbd655 verified 7 months ago

21.3 kB

	"""
	Intelligent Tokenizer v6.2.0 - Unified Model
	Integrates encoder, decoder, and tokenizer with all GPT improvements
	"""

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from typing import Dict, List, Optional, Tuple, Union
	import math

	# Import our components
	try:
	from .encoder import EncoderV62
	from .decoder import DecoderV62
	from .tokenizer import ByteTokenizerV62
	except ImportError:
	# For standalone testing
	from encoder import EncoderV62
	from decoder import DecoderV62
	from tokenizer import ByteTokenizerV62


	class IntelligentTokenizerV62(nn.Module):
	    """
	    Unified v6.2.0 model: byte tokenizer + encoder + decoder in one module.

	    The class wires ``ByteTokenizerV62``, ``EncoderV62`` and ``DecoderV62``
	    together and adds loss computation, sampling-based reconstruction,
	    compression statistics and checkpoint helpers.

	    Design notes carried over from the original author (these live in the
	    component modules, not in this class):
	    - 48-byte chunks (46+2 with BOS/EOS)
	    - Progressive splitting: 48→1→N→M tokens
	    - Multi-level cross-attention
	    - KV cache optimization (8x reduction)
	    """

	    def __init__(self, config: Optional[Dict] = None):
	        """
	        Build the three components and the training bookkeeping.

	        Args:
	            config: Optional configuration dict. It is forwarded unchanged
	                (possibly ``None``) to the tokenizer, encoder and decoder;
	                ``self.config`` stores ``{}`` when ``None`` is given.
	        """
	        super().__init__()

	        self.config = config or {}

	        # Components receive the caller's object as-is so that their own
	        # handling of ``None`` is not bypassed.
	        self.tokenizer = ByteTokenizerV62(config)
	        self.encoder = EncoderV62(config)
	        self.decoder = DecoderV62(config)

	        # Initial loss weights; update_training_state() overwrites them.
	        self.compression_weight = 0.1
	        self.reconstruction_weight = 0.1
	        self.boundary_weight = 0.1

	        # Registered as buffers so they follow .to(device) and are saved in
	        # the state dict.
	        self.register_buffer('training_step', torch.tensor(0))
	        self.register_buffer('current_epoch', torch.tensor(0))

	    @staticmethod
	    def _as_batch(tensor: torch.Tensor) -> torch.Tensor:
	        """Add a leading batch dimension to a 1-D tensor; pass others through."""
	        return tensor.unsqueeze(0) if tensor.dim() == 1 else tensor

	    def _encode_text(self, text: str, **encode_kwargs):
	        """Tokenize ``text`` and return ``(encoded, input_ids, attention_mask)`` with batched tensors."""
	        encoded = self.tokenizer.encode(text, add_special_tokens=True, **encode_kwargs)
	        input_ids = self._as_batch(encoded['input_ids'])
	        attention_mask = self._as_batch(encoded['attention_mask'])
	        return encoded, input_ids, attention_mask

	    @staticmethod
	    def _sample_next_token(logits: torch.Tensor,
	                           temperature: float,
	                           top_k: int,
	                           top_p: float) -> torch.Tensor:
	        """Pick one token id per row from ``logits`` of shape (batch, vocab); returns (batch, 1)."""
	        # A non-positive temperature cannot be divided by; treat it as greedy.
	        if temperature <= 0:
	            return logits.argmax(dim=-1, keepdim=True)

	        logits = logits / temperature

	        if top_k > 0:
	            # Clamp so a top_k larger than the vocabulary does not raise.
	            k = min(top_k, logits.size(-1))
	            kth_value = torch.topk(logits, k)[0][..., -1, None]
	            logits = logits.masked_fill(logits < kth_value, float('-inf'))

	        if 0.0 < top_p < 1.0:
	            # Nucleus filtering: keep the smallest high-probability prefix
	            # whose mass reaches top_p. A token is dropped when the mass of
	            # the tokens ranked before it already exceeds top_p, so the top
	            # token (mass-before == 0) always survives.
	            sorted_logits, sorted_idx = torch.sort(logits, dim=-1, descending=True)
	            sorted_probs = F.softmax(sorted_logits, dim=-1)
	            mass_before = sorted_probs.cumsum(dim=-1) - sorted_probs
	            sorted_drop = mass_before > top_p
	            drop = torch.zeros_like(sorted_drop).scatter(-1, sorted_idx, sorted_drop)
	            logits = logits.masked_fill(drop, float('-inf'))

	        probs = F.softmax(logits, dim=-1)
	        return torch.multinomial(probs, num_samples=1)

	    def forward(self,
	                input_ids: torch.Tensor = None,
	                attention_mask: torch.Tensor = None,
	                labels: torch.Tensor = None,
	                text: str = None,
	                return_loss: bool = True,
	                temperature: float = 1.0) -> Dict[str, torch.Tensor]:
	        """
	        Unified forward pass through encoder and decoder.

	        Args:
	            input_ids: Pre-tokenized input (optional). A ``str`` passed here
	                by mistake is treated as ``text``.
	            attention_mask: Attention mask (optional).
	            labels: Target ids for teacher forcing (optional). When omitted
	                and ``return_loss`` is true, ``input_ids`` is used as the
	                target (auto-encoding).
	            text: Raw text input; when given it replaces ``input_ids`` and
	                ``attention_mask`` with the tokenizer's output.
	            return_loss: Whether to compute and attach ``outputs['loss']``.
	            temperature: Forwarded to the encoder (described by the original
	                author as the Gumbel-Softmax temperature).

	        Returns:
	            Dict holding every encoder output under an ``enc_`` prefix, every
	            decoder output under a ``dec_`` prefix, and ``loss`` when computed.
	        """
	        # A string in the input_ids slot is a common mistake; treat it as text.
	        if text is None and isinstance(input_ids, str):
	            text = input_ids
	        if text is not None:
	            _, input_ids, attention_mask = self._encode_text(text)

	        # Move tensors to the model's device.
	        device = next(self.parameters()).device
	        if torch.is_tensor(input_ids):
	            input_ids = input_ids.to(device)
	        if torch.is_tensor(attention_mask):
	            attention_mask = attention_mask.to(device)
	        if torch.is_tensor(labels):
	            labels = labels.to(device)

	        encoder_outputs = self.encoder(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            temperature=temperature
	        )

	        # Without explicit labels, reconstruct the input itself when a loss
	        # is wanted.
	        if labels is None and return_loss and input_ids is not None:
	            labels = input_ids

	        if labels is not None:
	            # Teacher forcing: the decoder sees labels[:-1]; compute_loss()
	            # compares its logits against labels[1:].
	            decoder_input = labels[:, :-1] if labels.dim() > 1 else labels[:-1]
	            if attention_mask is not None and attention_mask.dim() > 1:
	                decoder_mask = attention_mask[:, :-1]
	            else:
	                decoder_mask = None
	            decoder_outputs = self.decoder(
	                encoder_all_hidden=encoder_outputs['all_hidden_states'],
	                decoder_input_ids=decoder_input,
	                attention_mask=decoder_mask
	            )
	        else:
	            # No targets at all: let the decoder run without input ids.
	            decoder_outputs = self.decoder(
	                encoder_all_hidden=encoder_outputs['all_hidden_states'],
	                decoder_input_ids=None,
	                attention_mask=attention_mask
	            )

	        # Prefix keys so encoder and decoder entries cannot collide.
	        outputs = {f'enc_{key}': value for key, value in encoder_outputs.items()}
	        outputs.update({f'dec_{key}': value for key, value in decoder_outputs.items()})

	        if return_loss and labels is not None:
	            outputs['loss'] = self.compute_loss(outputs, labels, attention_mask)

	        return outputs

	    def compute_loss(self,
	                     outputs: Dict[str, torch.Tensor],
	                     labels: torch.Tensor,
	                     attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	        """
	        Compute the weighted sum of the available loss terms.

	        Components (each only when its key is present in ``outputs``):
	        1. Reconstruction: cross-entropy of ``dec_logits`` against the labels
	           shifted by one, with label smoothing 0.1 and PAD ignored.
	        2. Compression: smooth-L1 distance of ``enc_compression_ratio`` from 24.
	        3. Boundary: sparsity + smoothness penalty on ``enc_boundaries``.

	        Args:
	            outputs: Prefixed output dict as built by ``forward``.
	            labels: Target token ids.
	            attention_mask: Optional mask aligned with ``labels``; positions
	                with a false mask are excluded from the reconstruction term.

	        Returns:
	            Scalar loss tensor (a zero tensor when no term applies). Detached
	            copies of the weighted terms are stored in ``self.last_losses``.
	        """
	        losses = {}

	        # 1. Reconstruction loss on shifted targets (next-token prediction).
	        if 'dec_logits' in outputs:
	            logits = outputs['dec_logits']

	            target_labels = labels[:, 1:] if labels.dim() > 1 else labels[1:]
	            if attention_mask is not None and attention_mask.dim() > 1:
	                target_mask = attention_mask[:, 1:]
	            else:
	                target_mask = None

	            logits_flat = logits.reshape(-1, logits.size(-1))
	            labels_flat = target_labels.reshape(-1)

	            if target_mask is not None:
	                keep = target_mask.reshape(-1).bool()
	                logits_flat = logits_flat[keep]
	                labels_flat = labels_flat[keep]

	            if labels_flat.numel() == 0:
	                # Nothing to score: cross_entropy would average over zero
	                # elements and return NaN. Use a graph-connected zero.
	                reconstruction_loss = logits.sum() * 0.0
	            else:
	                reconstruction_loss = F.cross_entropy(
	                    logits_flat,
	                    labels_flat,
	                    ignore_index=self.tokenizer.PAD,
	                    label_smoothing=0.1
	                )

	            losses['reconstruction'] = reconstruction_loss * self.reconstruction_weight

	        # 2. Compression loss: distance from the target ratio.
	        if 'enc_compression_ratio' in outputs:
	            target_ratio = 24.0
	            current_ratio = outputs['enc_compression_ratio']

	            if isinstance(current_ratio, (int, float)):
	                # A plain number carries no gradient, so in this case the
	                # term only affects the reported loss value.
	                current_ratio_tensor = labels.new_tensor(current_ratio, dtype=torch.float32)
	            else:
	                current_ratio_tensor = current_ratio.float()
	            # Same shape and device as the current ratio: avoids broadcast
	            # warnings and cross-device errors.
	            target_ratio_tensor = torch.full_like(current_ratio_tensor, target_ratio)

	            # Smooth L1 keeps large deviations from exploding the loss;
	            # beta is the switch point between the quadratic and linear parts.
	            compression_loss = F.smooth_l1_loss(
	                current_ratio_tensor,
	                target_ratio_tensor,
	                beta=2.0
	            )

	            losses['compression'] = compression_loss * self.compression_weight

	        # 3. Boundary loss.
	        if outputs.get('enc_boundaries') is not None:
	            boundary_scores = outputs['enc_boundaries']
	            boundary_probs = torch.sigmoid(boundary_scores)

	            # Sparsity: boundaries should be rare.
	            sparsity_loss = boundary_probs.mean() * 0.1

	            # Smoothness: the squared difference penalises abrupt changes
	            # between adjacent boundary scores.
	            if boundary_scores.size(1) > 1:
	                diff = boundary_scores[:, 1:] - boundary_scores[:, :-1]
	                smoothness_loss = (diff ** 2).mean() * 0.01
	            else:
	                smoothness_loss = 0.0

	            boundary_loss = sparsity_loss + smoothness_loss
	            losses['boundary'] = boundary_loss * self.boundary_weight

	        if losses:
	            total_loss = sum(losses.values())
	        else:
	            # Always hand back a tensor so callers can treat the result uniformly.
	            total_loss = labels.new_zeros((), dtype=torch.float32)

	        # Keep detached copies for monitoring; holding the live tensors would
	        # keep the whole autograd graph alive until the next call.
	        self.last_losses = {
	            name: value.detach() if torch.is_tensor(value) else value
	            for name, value in losses.items()
	        }

	        return total_loss

	    def generate(self,
	                 text: str = None,
	                 input_ids: torch.Tensor = None,
	                 max_length: int = 48,
	                 temperature: float = 0.1,
	                 top_k: int = 10,
	                 top_p: float = 0.95) -> str:
	        """
	        Encode the input and autoregressively sample a reconstruction.

	        Args:
	            text: Input text to encode and reconstruct.
	            input_ids: Pre-encoded input, used when ``text`` is ``None``.
	                A 1-D tensor is treated as a single sequence.
	            max_length: Maximum generated length including the BOS token.
	            temperature: Sampling temperature; a value <= 0 selects greedy
	                (argmax) decoding.
	            top_k: Keep only the ``top_k`` highest logits (0 disables; values
	                above the vocabulary size are clamped).
	            top_p: Nucleus threshold; values outside (0, 1) disable it.

	        Returns:
	            Reconstructed/generated text as produced by the tokenizer.

	        Raises:
	            ValueError: If neither ``text`` nor ``input_ids`` is given.
	        """
	        chunk_positions = None
	        if text is not None:
	            # Inputs longer than one chunk's content are encoded as chunks.
	            if len(text.encode('utf-8')) > self.tokenizer.content_size:
	                encoded, input_ids, attention_mask = self._encode_text(text, return_chunks=True)
	                chunk_positions = encoded.get('chunk_positions', None)
	            else:
	                _, input_ids, attention_mask = self._encode_text(text)
	        elif input_ids is not None:
	            input_ids = self._as_batch(input_ids)
	            attention_mask = (input_ids != self.tokenizer.PAD).bool()
	        else:
	            raise ValueError("generate() requires either `text` or `input_ids`")

	        device = next(self.parameters()).device
	        input_ids = input_ids.to(device)
	        attention_mask = attention_mask.to(device)

	        with torch.no_grad():
	            encoder_outputs = self.encoder(
	                input_ids=input_ids,
	                attention_mask=attention_mask
	            )

	        # The decoder takes a list of hidden states; when the encoder does
	        # not expose one, repeat the compressed state four times.
	        if 'all_hidden_states' in encoder_outputs:
	            encoder_all_hidden = encoder_outputs['all_hidden_states']
	        else:
	            compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
	            encoder_all_hidden = [compressed] * 4

	        batch_size = input_ids.size(0)
	        generated_ids = torch.full(
	            (batch_size, 1), self.tokenizer.BOS, dtype=torch.long, device=device
	        )
	        # Per-row flag so a row that emitted EOS stops producing content even
	        # if other rows are still running.
	        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

	        with torch.no_grad():
	            for _ in range(max_length - 1):
	                decoder_outputs = self.decoder(
	                    encoder_all_hidden=encoder_all_hidden,
	                    decoder_input_ids=generated_ids,
	                    attention_mask=torch.ones_like(generated_ids),
	                    use_cache=False
	                )

	                next_token = self._sample_next_token(
	                    decoder_outputs['logits'][:, -1, :], temperature, top_k, top_p
	                )
	                # Rows that already ended are padded instead of sampled.
	                next_token = next_token.masked_fill(finished.unsqueeze(1), self.tokenizer.PAD)
	                generated_ids = torch.cat([generated_ids, next_token], dim=1)

	                finished = finished | (next_token.squeeze(1) == self.tokenizer.EOS)
	                if finished.all():
	                    break

	        # NOTE(review): ``generated_ids`` is always 2-D (batch, length) here, so
	        # the two ``reconstruct`` branches below are never taken and only row 0
	        # is decoded — multi-chunk input appears to lose every chunk after the
	        # first. Left unchanged because the input ``reconstruct`` expects is
	        # not visible from this file; confirm and fix the dispatch.
	        if generated_ids.dim() > 2 and chunk_positions is not None:
	            text = self.tokenizer.reconstruct(
	                generated_ids,
	                positions=chunk_positions,
	                overlap=self.tokenizer.chunk_overlap
	            )
	        elif generated_ids.dim() > 2:
	            text = self.tokenizer.reconstruct(generated_ids)
	        else:
	            text = self.tokenizer.decode(generated_ids[0] if generated_ids.dim() > 1 else generated_ids)

	        return text

	    def compress(self, text: str) -> Dict[str, Union[torch.Tensor, float]]:
	        """
	        Run the encoder on ``text`` and report compression statistics.

	        Args:
	            text: Input text to compress.

	        Returns:
	            Dict with the encoder's ``compressed``, ``num_tokens`` and
	            ``compression_ratio`` outputs, the UTF-8 byte length of ``text``
	            (``original_bytes``) and ``compressed_size`` (``num_tokens * 2``,
	            an approximate byte count).
	        """
	        _, input_ids, attention_mask = self._encode_text(text)

	        device = next(self.parameters()).device
	        input_ids = input_ids.to(device)
	        attention_mask = attention_mask.to(device)

	        with torch.no_grad():
	            encoder_outputs = self.encoder(
	                input_ids=input_ids,
	                attention_mask=attention_mask
	            )

	        num_tokens = encoder_outputs['num_tokens']
	        return {
	            'compressed': encoder_outputs['compressed'],
	            'num_tokens': num_tokens,
	            'compression_ratio': encoder_outputs['compression_ratio'],
	            'original_bytes': len(text.encode('utf-8')),
	            'compressed_size': num_tokens * 2  # Approximate bytes
	        }

	    def update_training_state(self, epoch: int, step: int = 0, reconstruction_loss: float = None):
	        """
	        Record training progress and adapt the loss weights.

	        Args:
	            epoch: Current epoch.
	            step: Current training step; also passed to the encoder's warmup.
	            reconstruction_loss: Latest reconstruction loss. Above 1.0 the
	                reconstruction weight becomes 1.0, otherwise 0.5; the value is
	                also forwarded to the encoder's adaptive compression control.
	                ``None`` selects the default weights.
	        """
	        # Write in place: rebinding the attribute to a new tensor would
	        # replace the registered buffer with a CPU tensor and drop it off the
	        # model's device.
	        self.current_epoch.fill_(int(epoch))
	        self.training_step.fill_(int(step))

	        # Update encoder warmup (gates only)
	        self.encoder.set_warmup_step(step)

	        # Compression and boundary weights are 0.1 in every case; only the
	        # reconstruction weight adapts.
	        self.compression_weight = 0.1
	        self.boundary_weight = 0.1

	        if reconstruction_loss is None:
	            self.reconstruction_weight = 0.5
	            return

	        # Poor reconstruction gets full weight, good reconstruction half.
	        self.reconstruction_weight = 1.0 if reconstruction_loss > 1.0 else 0.5

	        # Let encoder know about reconstruction quality
	        self.encoder.adaptive_compression_control(reconstruction_loss)

	    def get_model_stats(self) -> Dict[str, float]:
	        """
	        Collect monitoring statistics.

	        Returns:
	            Dict with the encoder's monitoring stats (``encoder_`` prefix),
	            the decoder's memory usage (``decoder_`` prefix), the last loss
	            terms as floats (``loss_`` prefix, when a loss has been computed)
	            and ``current_epoch`` / ``training_step``.
	        """
	        stats = {}

	        encoder_stats = self.encoder.get_monitoring_stats()
	        stats.update({f'encoder_{k}': v for k, v in encoder_stats.items()})

	        decoder_memory = self.decoder.get_memory_usage()
	        stats.update({f'decoder_{k}': v for k, v in decoder_memory.items()})

	        # last_losses only exists once compute_loss() has run.
	        for name, value in getattr(self, 'last_losses', {}).items():
	            if isinstance(value, torch.Tensor):
	                # Multi-element tensors are reduced to their mean.
	                stats[f'loss_{name}'] = value.item() if value.numel() == 1 else value.mean().item()
	            else:
	                stats[f'loss_{name}'] = float(value)

	        stats['current_epoch'] = self.current_epoch.item()
	        stats['training_step'] = self.training_step.item()

	        return stats

	    def save_checkpoint(self, path: str):
	        """
	        Save the state dict, config, epoch/step and current stats to ``path``.

	        Args:
	            path: Path to save checkpoint
	        """
	        checkpoint = {
	            'model_state_dict': self.state_dict(),
	            'config': self.config,
	            'epoch': self.current_epoch.item(),
	            'step': self.training_step.item(),
	            'stats': self.get_model_stats()
	        }
	        torch.save(checkpoint, path)
	        print(f"Checkpoint saved to {path}")

	    @classmethod
	    def from_checkpoint(cls, path: str, device: str = 'cuda'):
	        """
	        Load a model saved with ``save_checkpoint``.

	        Args:
	            path: Path to checkpoint
	            device: Device to load the model on. A CUDA device falls back to
	                CPU (with a warning) when CUDA is not available.

	        Returns:
	            Loaded model instance
	        """
	        import warnings

	        if str(device).startswith('cuda') and not torch.cuda.is_available():
	            warnings.warn(f"CUDA is not available; loading checkpoint on CPU instead of {device}")
	            device = 'cpu'

	        # NOTE(security): torch.load unpickles the file; only load
	        # checkpoints that come from a trusted source.
	        checkpoint = torch.load(path, map_location=device)

	        model = cls(checkpoint.get('config', {}))
	        model.load_state_dict(checkpoint['model_state_dict'])
	        model.to(device)

	        # Restore training state in place so the buffers stay on ``device``.
	        if 'epoch' in checkpoint:
	            model.current_epoch.fill_(int(checkpoint['epoch']))
	        if 'step' in checkpoint:
	            model.training_step.fill_(int(checkpoint['step']))

	        print(f"Model loaded from {path} (Epoch {checkpoint.get('epoch', 0)})")
	        return model


	if __name__ == "__main__":
	    # Smoke test: build the unified model, then compress and reconstruct a
	    # few multilingual samples and dump the monitoring statistics.
	    print("Testing Intelligent Tokenizer v6.2.0")

	    model = IntelligentTokenizerV62()
	    param_count = sum(p.numel() for p in model.parameters())
	    print(f"Model created with {param_count/1e6:.1f}M parameters")

	    # English, Korean and Chinese samples
	    samples = (
	        "Hello, world!",
	        "안녕하세요, 만나서 반갑습니다. 오늘 날씨가 좋네요!",
	        "今天天气很好。",
	    )

	    for sample in samples:
	        print(f"\nInput: {sample}")

	        # Compression statistics from the encoder
	        summary = model.compress(sample)
	        print(f" Compression ratio: {summary['compression_ratio']:.1f}:1")
	        print(f" Tokens: {summary['num_tokens']}")

	        # Round trip through encoder + sampled decoding
	        round_trip = model.generate(sample, temperature=0.1)
	        print(f" Reconstructed: {round_trip}")

	    # Floats are shown with four decimals, everything else as-is.
	    print("\nModel Statistics:")
	    for stat_name, stat_value in model.get_model_stats().items():
	        shown = f"{stat_value:.4f}" if isinstance(stat_value, float) else f"{stat_value}"
	        print(f" {stat_name}: {shown}")