File size: 5,902 Bytes
a79c31e |
1 2 3 4 5 6 7 8 |
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 4 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx8.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (256, 256, 256, 256, 256), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.02, 0.06, 0.2, 0.2, 0.5), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 4 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx8.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (256, 256, 256, 256, 256), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.12, 0.12, 0.2, 0.2, 0.2), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 2 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx16.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.12, 0.12, 0.2, 0.2, 0.2), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 2 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx16.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.2, 0.4, 0.5, 0.5, 0.4), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 2 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx16.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.06, 0.06, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 2 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx32.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (1024, 1024, 1024, 1024, 1024), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.06, 0.06, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
name standard.tiny_32x4 | device cuda | compile True | data_dir data/tiny_stories_10m | should_randomize True | log_interval 10 | eval_interval 250 | eval_steps 100 | batch_size 128 | gradient_accumulation_steps 2 | learning_rate 0.001 | warmup_steps 0 | max_steps 5000 | decay_lr True | min_lr 0.0001 | weight_decay 0.1 | grad_clip 1.0 | sae_config {'name': 'standardx16.tiny_32x4', 'device': device(type='cuda'), 'compile': True, 'gpt_config': {'name': 'tiktoken_32x4', 'device': device(type='cuda'), 'compile': True, 'block_size': 128, 'vocab_size': 50257, 'n_layer': 4, 'n_head': 16, 'n_embd': 32}, 'n_features': (512, 512, 512, 512, 512), 'sae_variant': <SAEVariant.STANDARD: 'standard'>} | trainable_layers None | loss_coefficients {'sparsity': (0.06, 0.06, 0.1, 0.1, 0.1), 'regularization': None, 'downstream': None, 'bandwidth': None}
|