eoinf commited on
Commit
9a87e42
·
verified ·
1 Parent(s): fdb6e27

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. checkpoints/metadata_000000032768.json +1 -0
  3. checkpoints/metadata_000000327680.json +1 -0
  4. checkpoints/metadata_000000360448.json +1 -0
  5. checkpoints/metadata_000000425984.json +1 -0
  6. checkpoints/metadata_000000458752.json +1 -0
  7. checkpoints/metadata_000000491520.json +1 -0
  8. checkpoints/metadata_000000557056.json +1 -0
  9. checkpoints/metadata_000000622592.json +1 -0
  10. checkpoints/metadata_000000688128.json +1 -0
  11. checkpoints/metadata_000000753664.json +1 -0
  12. checkpoints/metadata_000000819200.json +1 -0
  13. checkpoints/metadata_000000917504.json +1 -0
  14. checkpoints/metadata_000000983040.json +1 -0
  15. checkpoints/metadata_000001114112.json +1 -0
  16. checkpoints/metadata_000001212416.json +1 -0
  17. checkpoints/metadata_000001343488.json +1 -0
  18. checkpoints/metadata_000001474560.json +1 -0
  19. checkpoints/metadata_000001605632.json +1 -0
  20. checkpoints/metadata_000001769472.json +1 -0
  21. checkpoints/metadata_000001966080.json +1 -0
  22. checkpoints/metadata_000002162688.json +1 -0
  23. checkpoints/metadata_000002359296.json +1 -0
  24. checkpoints/metadata_000002621440.json +1 -0
  25. checkpoints/metadata_000002883584.json +1 -0
  26. checkpoints/metadata_000003178496.json +1 -0
  27. checkpoints/metadata_000003473408.json +1 -0
  28. checkpoints/metadata_000003833856.json +1 -0
  29. checkpoints/metadata_000004227072.json +1 -0
  30. checkpoints/metadata_000004653056.json +1 -0
  31. checkpoints/metadata_000005111808.json +1 -0
  32. checkpoints/metadata_000005603328.json +1 -0
  33. checkpoints/metadata_000006193152.json +1 -0
  34. checkpoints/metadata_000006782976.json +1 -0
  35. checkpoints/metadata_000007471104.json +1 -0
  36. checkpoints/metadata_000008224768.json +1 -0
  37. checkpoints/metadata_000009043968.json +1 -0
  38. checkpoints/metadata_000009961472.json +1 -0
  39. checkpoints/metadata_000010944512.json +1 -0
  40. checkpoints/metadata_000012058624.json +1 -0
  41. checkpoints/metadata_000013271040.json +1 -0
  42. checkpoints/metadata_000014581760.json +1 -0
  43. checkpoints/metadata_000016056320.json +1 -0
  44. checkpoints/metadata_000016384000.json +1 -0
  45. checkpoints/metadata_000017661952.json +1 -0
  46. checkpoints/metadata_000019431424.json +1 -0
  47. checkpoints/metadata_000021364736.json +1 -0
  48. checkpoints/metadata_000023494656.json +1 -0
  49. checkpoints/metadata_000025853952.json +1 -0
  50. checkpoints/metadata_000028442624.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20260123_000739-ravyxvau/run-ravyxvau.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/metadata_000000032768.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 1, "tokens_seen": 32768, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.480770111083984}
checkpoints/metadata_000000327680.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10, "tokens_seen": 327680, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.419450851831872}
checkpoints/metadata_000000360448.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 11, "tokens_seen": 360448, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.402350478597977}
checkpoints/metadata_000000425984.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 13, "tokens_seen": 425984, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.367933364336938}
checkpoints/metadata_000000458752.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 14, "tokens_seen": 458752, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.343392533768213}
checkpoints/metadata_000000491520.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15, "tokens_seen": 491520, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.315834396165515}
checkpoints/metadata_000000557056.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 17, "tokens_seen": 557056, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.25264280478912}
checkpoints/metadata_000000622592.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 19, "tokens_seen": 622592, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.17958748250888}
checkpoints/metadata_000000688128.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 21, "tokens_seen": 688128, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.098195073786213}
checkpoints/metadata_000000753664.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 23, "tokens_seen": 753664, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 10.011746053778088}
checkpoints/metadata_000000819200.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 25, "tokens_seen": 819200, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.922323651359266}
checkpoints/metadata_000000917504.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 28, "tokens_seen": 917504, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.782112715541203}
checkpoints/metadata_000000983040.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 30, "tokens_seen": 983040, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.686813668029203}
checkpoints/metadata_000001114112.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 34, "tokens_seen": 1114112, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.492712140297815}
checkpoints/metadata_000001212416.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 37, "tokens_seen": 1212416, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.345396167842498}
checkpoints/metadata_000001343488.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 41, "tokens_seen": 1343488, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 9.148554591424418}
checkpoints/metadata_000001474560.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 45, "tokens_seen": 1474560, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 8.953220451709976}
checkpoints/metadata_000001605632.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 49, "tokens_seen": 1605632, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 8.761438956350457}
checkpoints/metadata_000001769472.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 54, "tokens_seen": 1769472, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 8.526760136469726}
checkpoints/metadata_000001966080.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 60, "tokens_seen": 1966080, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 8.254082460089649}
checkpoints/metadata_000002162688.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 66, "tokens_seen": 2162688, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 7.9922968512592485}
checkpoints/metadata_000002359296.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 72, "tokens_seen": 2359296, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 7.742491946012804}
checkpoints/metadata_000002621440.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 80, "tokens_seen": 2621440, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 7.426925934138872}
checkpoints/metadata_000002883584.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 88, "tokens_seen": 2883584, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 7.13152583675178}
checkpoints/metadata_000003178496.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 97, "tokens_seen": 3178496, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 6.822417279639748}
checkpoints/metadata_000003473408.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 106, "tokens_seen": 3473408, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 6.535011311370024}
checkpoints/metadata_000003833856.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 117, "tokens_seen": 3833856, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 6.214238950151746}
checkpoints/metadata_000004227072.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 129, "tokens_seen": 4227072, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 5.899472816247962}
checkpoints/metadata_000004653056.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 142, "tokens_seen": 4653056, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 5.593093818453131}
checkpoints/metadata_000005111808.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 156, "tokens_seen": 5111808, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 5.3021928933927045}
checkpoints/metadata_000005603328.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 171, "tokens_seen": 5603328, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 5.02936738891022}
checkpoints/metadata_000006193152.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 189, "tokens_seen": 6193152, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 4.74866017072963}
checkpoints/metadata_000006782976.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 207, "tokens_seen": 6782976, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 4.510208590982572}
checkpoints/metadata_000007471104.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 228, "tokens_seen": 7471104, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 4.278545038403737}
checkpoints/metadata_000008224768.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 251, "tokens_seen": 8224768, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 4.071446024593858}
checkpoints/metadata_000009043968.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 276, "tokens_seen": 9043968, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.8905378271839237}
checkpoints/metadata_000009961472.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 304, "tokens_seen": 9961472, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.7318137851109854}
checkpoints/metadata_000010944512.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 334, "tokens_seen": 10944512, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.600999131436243}
checkpoints/metadata_000012058624.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 368, "tokens_seen": 12058624, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.4885535250504063}
checkpoints/metadata_000013271040.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 405, "tokens_seen": 13271040, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.3980110362806557}
checkpoints/metadata_000014581760.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 445, "tokens_seen": 14581760, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.3258698381577174}
checkpoints/metadata_000016056320.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 490, "tokens_seen": 16056320, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.262849572849983}
checkpoints/metadata_000016384000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 500, "tokens_seen": 16384000, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.2490659841521863}
checkpoints/metadata_000017661952.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 539, "tokens_seen": 17661952, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.182954958030111}
checkpoints/metadata_000019431424.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 593, "tokens_seen": 19431424, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 3.02650953510771}
checkpoints/metadata_000021364736.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 652, "tokens_seen": 21364736, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 2.7855169718340793}
checkpoints/metadata_000023494656.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 717, "tokens_seen": 23494656, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 2.5643637997826074}
checkpoints/metadata_000025853952.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 789, "tokens_seen": 25853952, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 2.390097479846048}
checkpoints/metadata_000028442624.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 868, "tokens_seen": 28442624, "config": {"model_name": "syn_obs", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/trigrams_syn_k1000_support_7.0", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 50000000, "lr_hidden": 0.002, "lr_vector": 0.001, "lr_schedule": "constant_with_warmup", "warmup_tokens": 1000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 30, "max_steps": 1525}, "train_loss_ewma": 2.2609948052943043}