# Intelligent Tokenizer v6.2.0 Configuration
# Progressive Splitting with GPT-5 Improvements

model:
  name: "IntelligentTokenizerV62"
  version: "6.2.0"
  description: "Progressive splitting tokenizer with multi-level cross-attention"

# Architecture parameters
architecture:
  # Tokenizer settings
  tokenizer:
    content_size: 46  # Actual content bytes
    max_seq_len: 48   # Total with BOS/EOS
    chunk_overlap: 8  # Overlap for sliding window
    vocab_size: 260   # 256 bytes + 4 special tokens
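    # Illustration (not read by the trainer): a minimal Python sketch of the
    # sliding-window chunking these numbers imply; chunk_bytes is a hypothetical
    # helper, not this repo's API.
    #   def chunk_bytes(data: bytes, content_size: int = 46, overlap: int = 8):
    #       step = content_size - overlap  # 38 fresh bytes per window
    #       return [data[i:i + content_size]
    #               for i in range(0, max(len(data) - overlap, 1), step)]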

  # Encoder settings (4 layers)
  encoder:
    hidden_dim: 1280  # Unified dimension
    num_heads: 16     # Query heads
    kv_heads: 2       # Key-value heads (grouped-query attention; 16/2 = 8x smaller KV cache)
    num_layers: 4     # Total encoder layers
    dropout: 0.1
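    # Illustration: a self-contained sketch of grouped-query attention at these
    # shapes, with plain matmul weights assumed (not the model's actual module).
    #   import torch
    #   def gqa(x, wq, wk, wv, num_heads=16, kv_heads=2):
    #       b, s, d = x.shape                    # d = hidden_dim = 1280
    #       hd = d // num_heads                  # head_dim = 80
    #       q = (x @ wq).view(b, s, num_heads, hd).transpose(1, 2)
    #       k = (x @ wk).view(b, s, kv_heads, hd).transpose(1, 2)
    #       v = (x @ wv).view(b, s, kv_heads, hd).transpose(1, 2)
    #       k = k.repeat_interleave(num_heads // kv_heads, dim=1)  # 2 -> 16 heads
    #       v = v.repeat_interleave(num_heads // kv_heads, dim=1)
    #       a = torch.softmax(q @ k.transpose(-2, -1) / hd ** 0.5, dim=-1)
    #       return (a @ v).transpose(1, 2).reshape(b, s, d)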

  # TRUE adaptive splitting (fully learning-based, no hardcoding)
  adaptive_splitting:
    min_tokens: 1  # At least 1 token (48:1 compression)
    max_tokens: 4  # At most 4 tokens (12:1, still 3x better than BPE)
    # The model learns the compression ratio on its own:
    # 1 token = 48:1, 2 tokens = 24:1, 3 tokens = 16:1, 4 tokens = 12:1
    learning_based: true  # Fully learning-based
    use_importance: true  # Importance-based asymmetric splitting
    use_gumbel: true      # Differentiable selection via Gumbel-Softmax
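    # Illustration: differentiable 1-to-4 token-count choice per 48-byte window
    # via Gumbel-Softmax (torch.nn.functional.gumbel_softmax); the helper name
    # is hypothetical.
    #   import torch.nn.functional as F
    #   def choose_num_tokens(logits, tau=1.0):
    #       # logits: (batch, 4); straight-through one-hot over {1, 2, 3, 4}
    #       return F.gumbel_softmax(logits, tau=tau, hard=True)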

  # Gate warmup (GPT suggestion)
  warmup:
    enabled: true
    steps: 1000  # Warmup steps for gates
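    # Illustration: one plausible reading of the gate warmup, ramping the gates
    # in linearly over the first 1000 steps (hypothetical helper).
    #   def gate_scale(step: int, warmup_steps: int = 1000) -> float:
    #       return min(1.0, step / warmup_steps)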

  # Language clustering
  language:
    clusters: 128  # Reduced from 512 (GPT suggestion)
    embedding_dim: 256

  # Decoder settings (6 layers)
  decoder:
    hidden_dim: 1280  # Match encoder
    num_heads: 16     # Query heads
    kv_heads: 2       # Key-value heads (grouped-query attention)
    num_layers: 6     # Reduced from 8
    dropout: 0.1

  # Memory optimization
  kv_cache:
    enabled: true
    max_cache_size: 512  # Maximum cached tokens
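    # Illustration: a minimal rolling KV cache that honors max_cache_size
    # (hypothetical class, not the repo's implementation).
    #   import torch
    #   class KVCache:
    #       def __init__(self, max_cache_size=512):
    #           self.max, self.k, self.v = max_cache_size, None, None
    #       def update(self, k_new, v_new):  # (batch, kv_heads, seq_new, head_dim)
    #           cat = lambda a, b: b if a is None else torch.cat([a, b], dim=2)
    #           self.k = cat(self.k, k_new)[:, :, -self.max:]
    #           self.v = cat(self.v, v_new)[:, :, -self.max:]
    #           return self.k, self.v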

  # Cross-attention levels
  cross_attention:
    levels: [0, 1, 2, 3]    # Which encoder layers to attend to
    fusion: "weighted_sum"  # weighted_sum or concatenate
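    # Illustration: what "weighted_sum" fusion over the four encoder levels can
    # look like (learned softmax weights; hypothetical module).
    #   import torch
    #   class LevelFusion(torch.nn.Module):
    #       def __init__(self, num_levels=4):
    #           super().__init__()
    #           self.level_logits = torch.nn.Parameter(torch.zeros(num_levels))
    #       def forward(self, level_states):  # list of (batch, seq, 1280)
    #           w = torch.softmax(self.level_logits, dim=0)
    #           return sum(wi * h for wi, h in zip(w, level_states))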

# Generation settings
generation:
  max_length: 512
  temperature: 1.0
  top_k: 50
  top_p: 0.95
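  # Illustration: standard top-k + nucleus (top-p) filtering matching these
  # defaults (common sampler logic, not necessarily this repo's; 1-D logits).
  #   import torch
  #   def filter_logits(logits, top_k=50, top_p=0.95, temperature=1.0):
  #       logits = logits / temperature
  #       kth = torch.topk(logits, top_k).values[-1]
  #       logits = logits.masked_fill(logits < kth, float("-inf"))
  #       sorted_logits, sorted_idx = torch.sort(logits, descending=True)
  #       cum = torch.softmax(sorted_logits, dim=-1).cumsum(-1)
  #       remove = cum > top_p
  #       remove[1:] = remove[:-1].clone()  # keep the first token crossing p
  #       remove[0] = False
  #       mask = torch.zeros_like(remove).scatter(0, sorted_idx, remove)
  #       return logits.masked_fill(mask, float("-inf"))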

# Training configuration
training:
  # Adaptive loss weights (fully dynamic)
  adaptive_weights:
    # Initial values only; adjusted automatically during training
    reconstruction: 1.0  # Reconstruction quality (baseline)
    compression: 2.0     # Compression ratio (hold 16:1)
    boundary: 1.0        # Boundary learning (driven by importance)
  # Dynamic adjustment
  dynamic_loss_scaling: true
  scale_by_performance: true
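  # Illustration: one plausible reading of scale_by_performance (all names
  # hypothetical): each objective's weighted loss is divided by its progress
  # toward target, so lagging objectives get up-weighted.
  #   def total_loss(losses, weights, progress):
  #       # dicts keyed by "reconstruction", "compression", "boundary";
  #       # progress[k] = current metric / target for objective k
  #       return sum(weights[k] * losses[k] / max(progress[k], 1e-3) for k in losses)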

  # Optimizer settings
  optimizer:
    type: "AdamW"
    learning_rate: 0.00003  # Lowered further for batch 128
    betas: [0.9, 0.95]      # Lower beta2 for stability
    eps: 0.000001           # 1e-6 (increased)
    weight_decay: 0.0005    # Lowered further

  # Scheduler settings
  scheduler:
    type: "CosineAnnealingLR"
    T_max: 100
    eta_min: 0.000005   # Lower learning-rate floor
    warmup_steps: 2000  # Slower warmup (1000 -> 2000)
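  # Illustration: building this optimizer/scheduler pair in PyTorch; the linear
  # warmup wrapper and treating T_max as epochs are assumptions.
  #   import torch
  #   model = torch.nn.Linear(8, 8)  # stand-in for the real model
  #   opt = torch.optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.95),
  #                           eps=1e-6, weight_decay=5e-4)
  #   warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.01, total_iters=2000)
  #   cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100, eta_min=5e-6)
  #   sched = torch.optim.lr_scheduler.SequentialLR(opt, [warmup, cosine], milestones=[2000])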

  # Training parameters
  batch_size: 64                  # 128 also fits on the GPU used in practice
  gradient_accumulation_steps: 4  # Effective batch = 64 * 4 = 256
  max_grad_norm: 0.3              # Stronger clipping (1.0 -> 0.3)
  fp16: true
  gradient_checkpointing: true
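  # Illustration: the accumulation + clipping + fp16 step this implies, as a
  # runnable toy loop (stand-in model and data; GradScaler disabled off-GPU).
  #   import torch
  #   model = torch.nn.Linear(8, 1)
  #   opt = torch.optim.AdamW(model.parameters(), lr=3e-5)
  #   scaler = torch.cuda.amp.GradScaler(enabled=False)  # enabled=True with fp16 on CUDA
  #   for step, x in enumerate(torch.randn(8, 64, 8)):   # batch_size: 64
  #       loss = model(x).pow(2).mean() / 4               # divide by accumulation steps
  #       scaler.scale(loss).backward()
  #       if (step + 1) % 4 == 0:                         # effective batch 256
  #           scaler.unscale_(opt)
  #           torch.nn.utils.clip_grad_norm_(model.parameters(), 0.3)  # max_grad_norm
  #           scaler.step(opt)
  #           scaler.update()
  #           opt.zero_grad()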

  # Logging
  logging:
    log_interval: 100
    eval_interval: 500
    save_interval: 1
    wandb:
      enabled: false
      project: "intelligent-tokenizer-v62"

# Dataset configuration
dataset:
  train_path: "data/"
  val_path: "data/"
  test_path: "data/"

  # Data processing
  preprocessing:
    max_length: 2048  # Maximum input length
    stride: 1536      # Stride for long sequences
    min_length: 48    # Minimum sequence length

  # Language distribution (for balanced sampling; see the sketch after the list)
  languages:
    - code: "en"
      weight: 0.3
    - code: "ko"
      weight: 0.2
    - code: "zh"
      weight: 0.15
    - code: "ja"
      weight: 0.1
    - code: "es"
      weight: 0.05
    - code: "fr"
      weight: 0.05
    - code: "de"
      weight: 0.05
    - code: "ru"
      weight: 0.05
    - code: "ar"
      weight: 0.05
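  # Illustration: the weights above sum to 1.0, so they can drive per-batch
  # language sampling directly (hypothetical snippet).
  #   import random
  #   langs   = ["en", "ko", "zh", "ja", "es", "fr", "de", "ru", "ar"]
  #   weights = [0.3, 0.2, 0.15, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05]
  #   assert abs(sum(weights) - 1.0) < 1e-9
  #   batch_langs = random.choices(langs, weights=weights, k=64)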

# Evaluation metrics
evaluation:
  metrics:
    - compression_ratio        # Target: 12-48x (see targets below)
    - reconstruction_accuracy  # Target: >90%
    - boundary_precision       # Target: >90%
    - language_accuracy        # Target: >90%
  targets:
    compression_ratio:
      min: 12.0      # Even the worst case is 3x BPE (4 tokens)
      optimal: 24.0  # Average target (2 tokens)
      max: 48.0      # Best case (1 token)
    reconstruction_accuracy: 0.90
    boundary_precision: 0.90
    language_accuracy: 0.90
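  # Illustration: how these targets relate to the 48-byte window (hypothetical
  # metric helper).
  #   def compression_ratio(input_bytes: int, num_tokens: int) -> float:
  #       # 48 bytes -> 1 token = 48.0 (max); 2 -> 24.0 (optimal); 4 -> 12.0 (min)
  #       return input_bytes / max(num_tokens, 1)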

# Hardware settings
hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

# Checkpoint settings
checkpoint:
  save_dir: "checkpoints/v62/"
  resume_from: null
  save_best: true
  save_last: true
  max_checkpoints: 5

# Special tokens (must match tokenizer.py)
special_tokens:
  PAD: 256
  BOS: 257
  EOS: 258
  MASK: 259
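  # Illustration: byte-level encoding consistent with these ids and vocab_size 260
  # (hypothetical encode; the real one lives in tokenizer.py).
  #   PAD, BOS, EOS, MASK = 256, 257, 258, 259
  #   def encode(text: str) -> list[int]:
  #       body = list(text.encode("utf-8"))[:46]  # raw bytes are ids 0-255; content_size 46
  #       return [BOS] + body + [EOS]             # at most max_seq_len = 48 ids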

# Experimental features
experimental:
  # Gumbel-Softmax temperature annealing
  gumbel_annealing:
    enabled: true
    initial_temp: 1.0
    final_temp: 0.3      # 0.1 -> 0.3 (too low is unstable)
    anneal_rate: 0.9999  # 0.99995 -> 0.9999 (slightly faster)
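    # Illustration: the schedule these three values define (hypothetical helper);
    # 0.9999**step hits the 0.3 floor after roughly 12k steps.
    #   def gumbel_temperature(step: int) -> float:
    #       return max(0.3, 1.0 * 0.9999 ** step)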

  # Dynamic token allocation (truly dynamic)
  dynamic_tokens:
    enabled: true
    min_tokens: 1  # At least 1 (48:1)
    max_tokens: 4  # At most 4 (12:1, 3x better than BPE)
    # quality_threshold removed; the model learns this on its own

  # Boundary learning enhancements
  boundary_learning:
    utf8_aware: true
    word_aware: true
    phrase_aware: true
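    # Illustration: what utf8_aware boundary handling guarantees; a split point
    # never lands on a UTF-8 continuation byte (hypothetical check).
    #   def is_utf8_boundary(data: bytes, i: int) -> bool:
    #       # continuation bytes look like 0b10xxxxxx
    #       return i == 0 or (data[i] & 0b1100_0000) != 0b1000_0000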

  # Memory optimization
  memory:
    gradient_checkpointing: true
    mixed_precision: true
    optimize_cuda: true
    clear_cache_interval: 100