# Intelligent Tokenizer v6.2.0 Configuration
# Progressive Splitting with GPT-5 Improvements

model:
  name: "IntelligentTokenizerV62"
  version: "6.2.0"
  description: "Progressive splitting tokenizer with multi-level cross-attention"

# Architecture parameters
architecture:
  # Tokenizer settings
  tokenizer:
    content_size: 46      # Actual content bytes
    max_seq_len: 48       # Total with BOS/EOS
    chunk_overlap: 8      # Overlap for sliding window
    vocab_size: 260       # 256 bytes + 4 special tokens
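
    # For reference: 46 content bytes + BOS + EOS = 48 positions (max_seq_len),
    # and the vocabulary is the 256 raw byte values plus the 4 special tokens
    # listed under special_tokens below, giving vocab_size = 260.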

  # Encoder settings (4 layers)
  encoder:
    hidden_dim: 1280      # Unified dimension
    num_heads: 16         # Query heads
    kv_heads: 2           # Key-Value heads (MQA - 8x reduction)
    num_layers: 4         # Total encoder layers
    dropout: 0.1
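
    # Rough MQA bookkeeping, assuming the usual head_dim = hidden_dim / num_heads:
    #   head_dim = 1280 / 16 = 80
    #   K/V projections use 2 heads instead of 16, so per-layer K/V activations
    #   shrink by 16 / 2 = 8x, matching the "8x reduction" note above.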

  # TRUE adaptive splitting (fully learning-based, no hard-coded rules)
  adaptive_splitting:
    min_tokens: 1         # Minimum 1 token (48:1 compression)
    max_tokens: 4         # Maximum 4 tokens (12:1 compression, still 3x BPE)
    # The compression ratio itself is learned automatically by the model:
    # 1 token = 48:1, 2 tokens = 24:1, 3 tokens = 16:1, 4 tokens = 12:1
    learning_based: true  # Fully learning-based
    use_importance: true  # Importance-based asymmetric splitting
    use_gumbel: true      # Differentiable selection via Gumbel-Softmax
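
    # Standard Gumbel-Softmax relaxation, shown here for orientation only:
    #   y_i = softmax((log pi_i + g_i) / tau),  g_i ~ Gumbel(0, 1)
    # This keeps the discrete token-count choice differentiable; tau follows
    # the annealing schedule under experimental.gumbel_annealing below.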

    # Gate warmup (GPT suggestion)
    warmup:
      enabled: true
      steps: 1000         # Warmup steps for gates

  # Language clustering
  language:
    clusters: 128         # Reduced from 512 (GPT suggestion)
    embedding_dim: 256
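
  # Size check: the cluster embedding table is 128 x 256 = 32,768 parameters,
  # a quarter of the former 512-cluster table (assuming embedding_dim was unchanged).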

  # Decoder settings (6 layers)
  decoder:
    hidden_dim: 1280      # Match encoder
    num_heads: 16         # Query heads
    kv_heads: 2           # Key-Value heads (MQA)
    num_layers: 6         # 6 layers (reduced from 8)
    dropout: 0.1

    # Memory optimization
    kv_cache:
      enabled: true
      max_cache_size: 512 # Maximum cached tokens
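
      # Rough cache footprint, assuming fp16 and the head_dim = 80 derived above:
      #   per layer: 2 (K and V) * 2 kv_heads * 80 dims * 512 tokens * 2 bytes
      #   = 327,680 bytes (~0.3 MB), or ~1.9 MB across all 6 decoder layers.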

    # Cross-attention levels
    cross_attention:
      levels: [0, 1, 2, 3]    # Which encoder layers to attend to
      fusion: "weighted_sum"  # weighted_sum or concatenate
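
      # With fusion: "weighted_sum", the decoder presumably combines levels as
      #   fused = sum_l w_l * CrossAttn(h_dec, enc_l),  l in {0, 1, 2, 3}
      # with learned weights w_l; "concatenate" would instead join the per-level
      # outputs along the feature dimension.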

# Generation settings
generation:
  max_length: 512
  temperature: 1.0
  top_k: 50
  top_p: 0.95
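
  # Note: temperature 1.0 leaves logits unscaled; top_k and top_p then act as
  # successive filters (keep the 50 most likely tokens, then the nucleus covering
  # 95% of their mass), assuming the usual top-k-then-top-p ordering.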

# Training configuration
training:
  # Adaptive learning (fully dynamic adjustment)
  adaptive_weights:
    # Initial values only; adjusted automatically during training
    reconstruction: 1.0   # Reconstruction quality (baseline)
    compression: 2.0      # Compression ratio (hold 16:1)
    boundary: 1.0         # Boundary learning (weight raised)

  # Dynamic adjustment
  dynamic_loss_scaling: true
  scale_by_performance: true
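
  # The implied objective, with the initial weights above before dynamic scaling:
  #   L = 1.0 * L_reconstruction + 2.0 * L_compression + 1.0 * L_boundary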

  # Optimizer settings
  optimizer:
    type: "AdamW"
    learning_rate: 0.00003   # Lowered for batch size 128
    betas: [0.9, 0.95]       # beta2 lowered for stability
    eps: 0.000001            # 1e-6 (raised)
    weight_decay: 0.0005     # Lowered

  # Scheduler settings
  scheduler:
    type: "CosineAnnealingLR"
    T_max: 100
    eta_min: 0.000005     # Lower minimum learning rate
    warmup_steps: 2000    # Warmup lengthened (1000 -> 2000)
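
  # After warmup, PyTorch's CosineAnnealingLR follows
  #   lr(t) = eta_min + (lr_0 - eta_min) * (1 + cos(pi * t / T_max)) / 2
  # decaying from 3e-5 toward the 5e-6 floor over T_max = 100 units
  # (epochs or steps, depending on how scheduler.step() is called).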

  # Training parameters
  batch_size: 64            # GPU allows it (user runs 128)
  gradient_accumulation_steps: 4
  max_grad_norm: 0.3        # Tighter clipping (1.0 -> 0.3)
  fp16: true
  gradient_checkpointing: true
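
  # Effective batch size = 64 * 4 accumulation steps = 256 sequences per
  # optimizer update (512 with the batch size of 128 mentioned above).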

  # Logging
  logging:
    log_interval: 100
    eval_interval: 500
    save_interval: 1
    wandb:
      enabled: false
      project: "intelligent-tokenizer-v62"

# Dataset configuration
dataset:
  train_path: "data/"
  val_path: "data/"
  test_path: "data/"

  # Data processing
  preprocessing:
    max_length: 2048      # Maximum input length
    stride: 1536          # Stride for long sequences
    min_length: 48        # Minimum sequence length
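
    # Long inputs are windowed: with max_length 2048 and stride 1536,
    # consecutive windows overlap by 2048 - 1536 = 512 bytes.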

  # Language distribution (for balanced sampling)
  languages:
    - code: "en"
      weight: 0.3
    - code: "ko"
      weight: 0.2
    - code: "zh"
      weight: 0.15
    - code: "ja"
      weight: 0.1
    - code: "es"
      weight: 0.05
    - code: "fr"
      weight: 0.05
    - code: "de"
      weight: 0.05
    - code: "ru"
      weight: 0.05
    - code: "ar"
      weight: 0.05
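
  # Sanity check: the sampling weights sum to
  # 0.3 + 0.2 + 0.15 + 0.1 + 5 * 0.05 = 1.0.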

# Evaluation metrics
evaluation:
  metrics:
    - compression_ratio         # Target: 8-20x
    - reconstruction_accuracy   # Target: >95%
    - boundary_precision        # Target: >90%
    - language_accuracy         # Target: >95%
  targets:
    compression_ratio:
      min: 12.0       # Even the worst case is 3x BPE (4 tokens)
      optimal: 24.0   # Average target (2 tokens)
      max: 48.0       # Best case (1 token)
    reconstruction_accuracy: 0.90
    boundary_precision: 0.90
    language_accuracy: 0.90

# Hardware settings
hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

# Checkpoint settings
checkpoint:
  save_dir: "checkpoints/v62/"
  resume_from: null
  save_best: true
  save_last: true
  max_checkpoints: 5

# Special tokens (must match tokenizer.py)
special_tokens:
  PAD: 256
  BOS: 257
  EOS: 258
  MASK: 259
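
# The special IDs sit directly above the raw byte range: bytes occupy 0-255,
# PAD..MASK take 256-259, and the total matches vocab_size: 260 above.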

# Experimental features
experimental:
  # Gumbel-Softmax temperature annealing
  gumbel_annealing:
    enabled: true
    initial_temp: 1.0
    final_temp: 0.3      # 0.1 -> 0.3 (too low becomes unstable)
    anneal_rate: 0.9999  # 0.99995 -> 0.9999 (slightly faster annealing)
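
    # If the rate is applied per step, i.e. tau_t = max(0.3, 1.0 * 0.9999^t)
    # (a plausible reading, not confirmed by this file), tau reaches ~0.37 by
    # step 10,000 and hits the 0.3 floor near ln(0.3) / ln(0.9999) ~ 12,000 steps.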

  # Dynamic token allocation (truly dynamic)
  dynamic_tokens:
    enabled: true
    min_tokens: 1    # Minimum 1 token (48:1)
    max_tokens: 4    # Maximum 4 tokens (12:1, 3x BPE)
    # quality_threshold removed; the model learns this on its own

  # Boundary learning enhancements
  boundary_learning:
    utf8_aware: true
    word_aware: true
    phrase_aware: true

  # Memory optimization
  memory:
    gradient_checkpointing: true
    mixed_precision: true
    optimize_cuda: true
    clear_cache_interval: 100