# Intelligent Tokenizer v6.2.0 Configuration
# Progressive Splitting with GPT-5 Improvements

model:
  name: "IntelligentTokenizerV62"
  version: "6.2.0"
  description: "Progressive splitting tokenizer with multi-level cross-attention"

# Architecture parameters
architecture:
  # Tokenizer settings
  tokenizer:
    content_size: 46  # Actual content bytes
    max_seq_len: 48   # Total with BOS/EOS
    chunk_overlap: 8  # Overlap for sliding window
    vocab_size: 260   # 256 bytes + 4 special tokens
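    # Illustration (not read by the trainer): a minimal Python sketch of the
    # sliding-window chunking these numbers imply; chunk_bytes is a hypothetical
    # helper, not this repo's API.
    #   def chunk_bytes(data: bytes, content_size: int = 46, overlap: int = 8):
    #       step = content_size - overlap  # 38 fresh bytes per window
    #       return [data[i:i + content_size]
    #               for i in range(0, max(len(data) - overlap, 1), step)]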

  # Encoder settings (4 layers)
  encoder:
    hidden_dim: 1280  # Unified dimension
    num_heads: 16     # Query heads
    kv_heads: 2       # Key-value heads (grouped-query attention; 16/2 = 8x smaller KV cache)
    num_layers: 4     # Total encoder layers
    dropout: 0.1
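    # Illustration: a self-contained sketch of grouped-query attention at these
    # shapes, with plain matmul weights assumed (not the model's actual module).
    #   import torch
    #   def gqa(x, wq, wk, wv, num_heads=16, kv_heads=2):
    #       b, s, d = x.shape                    # d = hidden_dim = 1280
    #       hd = d // num_heads                  # head_dim = 80
    #       q = (x @ wq).view(b, s, num_heads, hd).transpose(1, 2)
    #       k = (x @ wk).view(b, s, kv_heads, hd).transpose(1, 2)
    #       v = (x @ wv).view(b, s, kv_heads, hd).transpose(1, 2)
    #       k = k.repeat_interleave(num_heads // kv_heads, dim=1)  # 2 -> 16 heads
    #       v = v.repeat_interleave(num_heads // kv_heads, dim=1)
    #       a = torch.softmax(q @ k.transpose(-2, -1) / hd ** 0.5, dim=-1)
    #       return (a @ v).transpose(1, 2).reshape(b, s, d)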

  # TRUE adaptive splitting (fully learning-based, no hardcoding)
  adaptive_splitting:
    min_tokens: 1  # At least 1 token (48:1 compression)
    max_tokens: 4  # At most 4 tokens (12:1, still 3x better than BPE)
    # The model learns the compression ratio on its own:
    # 1 token = 48:1, 2 tokens = 24:1, 3 tokens = 16:1, 4 tokens = 12:1
    learning_based: true  # Fully learning-based
    use_importance: true  # Importance-based asymmetric splitting
    use_gumbel: true      # Differentiable selection via Gumbel-Softmax
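    # Illustration: differentiable 1-to-4 token-count choice per 48-byte window
    # via Gumbel-Softmax (torch.nn.functional.gumbel_softmax); the helper name
    # is hypothetical.
    #   import torch.nn.functional as F
    #   def choose_num_tokens(logits, tau=1.0):
    #       # logits: (batch, 4); straight-through one-hot over {1, 2, 3, 4}
    #       return F.gumbel_softmax(logits, tau=tau, hard=True)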

  # Gate warmup (GPT suggestion)
  warmup:
    enabled: true
    steps: 1000  # Warmup steps for gates
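    # Illustration: one plausible reading of the gate warmup, ramping the gates
    # in linearly over the first 1000 steps (hypothetical helper).
    #   def gate_scale(step: int, warmup_steps: int = 1000) -> float:
    #       return min(1.0, step / warmup_steps)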

  # Language clustering
  language:
    clusters: 128  # Reduced from 512 (GPT suggestion)
    embedding_dim: 256

  # Decoder settings (6 layers)
  decoder:
    hidden_dim: 1280  # Match encoder
    num_heads: 16     # Query heads
    kv_heads: 2       # Key-value heads (grouped-query attention)
    num_layers: 6     # Reduced from 8
    dropout: 0.1

  # Memory optimization
  kv_cache:
    enabled: true
    max_cache_size: 512  # Maximum cached tokens
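    # Illustration: a minimal rolling KV cache that honors max_cache_size
    # (hypothetical class, not the repo's implementation).
    #   import torch
    #   class KVCache:
    #       def __init__(self, max_cache_size=512):
    #           self.max, self.k, self.v = max_cache_size, None, None
    #       def update(self, k_new, v_new):  # (batch, kv_heads, seq_new, head_dim)
    #           cat = lambda a, b: b if a is None else torch.cat([a, b], dim=2)
    #           self.k = cat(self.k, k_new)[:, :, -self.max:]
    #           self.v = cat(self.v, v_new)[:, :, -self.max:]
    #           return self.k, self.v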

  # Cross-attention levels
  cross_attention:
    levels: [0, 1, 2, 3]    # Which encoder layers to attend to
    fusion: "weighted_sum"  # weighted_sum or concatenate
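    # Illustration: what "weighted_sum" fusion over the four encoder levels can
    # look like (learned softmax weights; hypothetical module).
    #   import torch
    #   class LevelFusion(torch.nn.Module):
    #       def __init__(self, num_levels=4):
    #           super().__init__()
    #           self.level_logits = torch.nn.Parameter(torch.zeros(num_levels))
    #       def forward(self, level_states):  # list of (batch, seq, 1280)
    #           w = torch.softmax(self.level_logits, dim=0)
    #           return sum(wi * h for wi, h in zip(w, level_states))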

# Generation settings
generation:
  max_length: 512
  temperature: 1.0
  top_k: 50
  top_p: 0.95
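  # Illustration: standard top-k + nucleus (top-p) filtering matching these
  # defaults (common sampler logic, not necessarily this repo's; 1-D logits).
  #   import torch
  #   def filter_logits(logits, top_k=50, top_p=0.95, temperature=1.0):
  #       logits = logits / temperature
  #       kth = torch.topk(logits, top_k).values[-1]
  #       logits = logits.masked_fill(logits < kth, float("-inf"))
  #       sorted_logits, sorted_idx = torch.sort(logits, descending=True)
  #       cum = torch.softmax(sorted_logits, dim=-1).cumsum(-1)
  #       remove = cum > top_p
  #       remove[1:] = remove[:-1].clone()  # keep the first token crossing p
  #       remove[0] = False
  #       mask = torch.zeros_like(remove).scatter(0, sorted_idx, remove)
  #       return logits.masked_fill(mask, float("-inf"))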

# Training configuration
training:
  # Adaptive loss weights (fully dynamic)
  adaptive_weights:
    # Initial values only; adjusted automatically during training
    reconstruction: 1.0  # Reconstruction quality (baseline)
    compression: 2.0     # Compression ratio (hold 16:1)
    boundary: 1.0        # Boundary learning (driven by importance)
  # Dynamic adjustment
  dynamic_loss_scaling: true
  scale_by_performance: true
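  # Illustration: one plausible reading of scale_by_performance (all names
  # hypothetical): each objective's weighted loss is divided by its progress
  # toward target, so lagging objectives get up-weighted.
  #   def total_loss(losses, weights, progress):
  #       # dicts keyed by "reconstruction", "compression", "boundary";
  #       # progress[k] = current metric / target for objective k
  #       return sum(weights[k] * losses[k] / max(progress[k], 1e-3) for k in losses)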

  # Optimizer settings
  optimizer:
    type: "AdamW"
    learning_rate: 0.00003  # Lowered further for batch 128
    betas: [0.9, 0.95]      # Lower beta2 for stability
    eps: 0.000001           # 1e-6 (increased)
    weight_decay: 0.0005    # Lowered further

  # Scheduler settings
  scheduler:
    type: "CosineAnnealingLR"
    T_max: 100
    eta_min: 0.000005   # Lower learning-rate floor
    warmup_steps: 2000  # Slower warmup (1000 -> 2000)
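  # Illustration: building this optimizer/scheduler pair in PyTorch; the linear
  # warmup wrapper and treating T_max as epochs are assumptions.
  #   import torch
  #   model = torch.nn.Linear(8, 8)  # stand-in for the real model
  #   opt = torch.optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.95),
  #                           eps=1e-6, weight_decay=5e-4)
  #   warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.01, total_iters=2000)
  #   cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100, eta_min=5e-6)
  #   sched = torch.optim.lr_scheduler.SequentialLR(opt, [warmup, cosine], milestones=[2000])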

  # Training parameters
  batch_size: 64                  # 128 also fits on the GPU used in practice
  gradient_accumulation_steps: 4  # Effective batch = 64 * 4 = 256
  max_grad_norm: 0.3              # Stronger clipping (1.0 -> 0.3)
  fp16: true
  gradient_checkpointing: true
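  # Illustration: the accumulation + clipping + fp16 step this implies, as a
  # runnable toy loop (stand-in model and data; GradScaler disabled off-GPU).
  #   import torch
  #   model = torch.nn.Linear(8, 1)
  #   opt = torch.optim.AdamW(model.parameters(), lr=3e-5)
  #   scaler = torch.cuda.amp.GradScaler(enabled=False)  # enabled=True with fp16 on CUDA
  #   for step, x in enumerate(torch.randn(8, 64, 8)):   # batch_size: 64
  #       loss = model(x).pow(2).mean() / 4               # divide by accumulation steps
  #       scaler.scale(loss).backward()
  #       if (step + 1) % 4 == 0:                         # effective batch 256
  #           scaler.unscale_(opt)
  #           torch.nn.utils.clip_grad_norm_(model.parameters(), 0.3)  # max_grad_norm
  #           scaler.step(opt)
  #           scaler.update()
  #           opt.zero_grad()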

  # Logging
  logging:
    log_interval: 100
    eval_interval: 500
    save_interval: 1
    wandb:
      enabled: false
      project: "intelligent-tokenizer-v62"

# Dataset configuration
dataset:
  train_path: "data/"
  val_path: "data/"
  test_path: "data/"

  # Data processing
  preprocessing:
    max_length: 2048  # Maximum input length
    stride: 1536      # Stride for long sequences
    min_length: 48    # Minimum sequence length

  # Language distribution (for balanced sampling; see the sketch after the list)
  languages:
    - code: "en"
      weight: 0.3
    - code: "ko"
      weight: 0.2
    - code: "zh"
      weight: 0.15
    - code: "ja"
      weight: 0.1
    - code: "es"
      weight: 0.05
    - code: "fr"
      weight: 0.05
    - code: "de"
      weight: 0.05
    - code: "ru"
      weight: 0.05
    - code: "ar"
      weight: 0.05
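  # Illustration: the weights above sum to 1.0, so they can drive per-batch
  # language sampling directly (hypothetical snippet).
  #   import random
  #   langs   = ["en", "ko", "zh", "ja", "es", "fr", "de", "ru", "ar"]
  #   weights = [0.3, 0.2, 0.15, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05]
  #   assert abs(sum(weights) - 1.0) < 1e-9
  #   batch_langs = random.choices(langs, weights=weights, k=64)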

# Evaluation metrics
evaluation:
  metrics:
    - compression_ratio        # Target: 12-48x (see targets below)
    - reconstruction_accuracy  # Target: >90%
    - boundary_precision       # Target: >90%
    - language_accuracy        # Target: >90%
  targets:
    compression_ratio:
      min: 12.0      # Even the worst case is 3x BPE (4 tokens)
      optimal: 24.0  # Average target (2 tokens)
      max: 48.0      # Best case (1 token)
    reconstruction_accuracy: 0.90
    boundary_precision: 0.90
    language_accuracy: 0.90
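  # Illustration: how these targets relate to the 48-byte window (hypothetical
  # metric helper).
  #   def compression_ratio(input_bytes: int, num_tokens: int) -> float:
  #       # 48 bytes -> 1 token = 48.0 (max); 2 -> 24.0 (optimal); 4 -> 12.0 (min)
  #       return input_bytes / max(num_tokens, 1)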

# Hardware settings
hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

# Checkpoint settings
checkpoint:
  save_dir: "checkpoints/v62/"
  resume_from: null
  save_best: true
  save_last: true
  max_checkpoints: 5

# Special tokens (must match tokenizer.py)
special_tokens:
  PAD: 256
  BOS: 257
  EOS: 258
  MASK: 259
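  # Illustration: byte-level encoding consistent with these ids and vocab_size 260
  # (hypothetical encode; the real one lives in tokenizer.py).
  #   PAD, BOS, EOS, MASK = 256, 257, 258, 259
  #   def encode(text: str) -> list[int]:
  #       body = list(text.encode("utf-8"))[:46]  # raw bytes are ids 0-255; content_size 46
  #       return [BOS] + body + [EOS]             # at most max_seq_len = 48 ids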

# Experimental features
experimental:
  # Gumbel-Softmax temperature annealing
  gumbel_annealing:
    enabled: true
    initial_temp: 1.0
    final_temp: 0.3      # 0.1 -> 0.3 (too low is unstable)
    anneal_rate: 0.9999  # 0.99995 -> 0.9999 (slightly faster)
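    # Illustration: the schedule these three values define (hypothetical helper);
    # 0.9999**step hits the 0.3 floor after roughly 12k steps.
    #   def gumbel_temperature(step: int) -> float:
    #       return max(0.3, 1.0 * 0.9999 ** step)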

  # Dynamic token allocation (truly dynamic)
  dynamic_tokens:
    enabled: true
    min_tokens: 1  # At least 1 (48:1)
    max_tokens: 4  # At most 4 (12:1, 3x better than BPE)
    # quality_threshold removed; the model learns this on its own

  # Boundary learning enhancements
  boundary_learning:
    utf8_aware: true
    word_aware: true
    phrase_aware: true
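    # Illustration: what utf8_aware boundary handling guarantees; a split point
    # never lands on a UTF-8 continuation byte (hypothetical check).
    #   def is_utf8_boundary(data: bytes, i: int) -> bool:
    #       # continuation bytes look like 0b10xxxxxx
    #       return i == 0 or (data[i] & 0b1100_0000) != 0b1000_0000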

  # Memory optimization
  memory:
    gradient_checkpointing: true
    mixed_precision: true
    optimize_cuda: true
    clear_cache_interval: 100