eoinf commited on
Commit
86d91ae
·
verified ·
1 Parent(s): d9fd107

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. checkpoints/metadata_000000032768.json +1 -0
  3. checkpoints/metadata_000000327680.json +1 -0
  4. checkpoints/metadata_000000360448.json +1 -0
  5. checkpoints/metadata_000000425984.json +1 -0
  6. checkpoints/metadata_000000458752.json +1 -0
  7. checkpoints/metadata_000000491520.json +1 -0
  8. checkpoints/metadata_000000557056.json +1 -0
  9. checkpoints/metadata_000000622592.json +1 -0
  10. checkpoints/metadata_000000688128.json +1 -0
  11. checkpoints/metadata_000000753664.json +1 -0
  12. checkpoints/metadata_000000819200.json +1 -0
  13. checkpoints/metadata_000000917504.json +1 -0
  14. checkpoints/metadata_000000983040.json +1 -0
  15. checkpoints/metadata_000001114112.json +1 -0
  16. checkpoints/metadata_000001212416.json +1 -0
  17. checkpoints/metadata_000001343488.json +1 -0
  18. checkpoints/metadata_000001474560.json +1 -0
  19. checkpoints/metadata_000001605632.json +1 -0
  20. checkpoints/metadata_000001769472.json +1 -0
  21. checkpoints/metadata_000001966080.json +1 -0
  22. checkpoints/metadata_000002162688.json +1 -0
  23. checkpoints/metadata_000002359296.json +1 -0
  24. checkpoints/metadata_000002621440.json +1 -0
  25. checkpoints/metadata_000002883584.json +1 -0
  26. checkpoints/metadata_000003178496.json +1 -0
  27. checkpoints/metadata_000003473408.json +1 -0
  28. checkpoints/metadata_000003833856.json +1 -0
  29. checkpoints/metadata_000004227072.json +1 -0
  30. checkpoints/metadata_000004653056.json +1 -0
  31. checkpoints/metadata_000005111808.json +1 -0
  32. checkpoints/metadata_000005603328.json +1 -0
  33. checkpoints/metadata_000006193152.json +1 -0
  34. checkpoints/metadata_000006782976.json +1 -0
  35. checkpoints/metadata_000007471104.json +1 -0
  36. checkpoints/metadata_000008224768.json +1 -0
  37. checkpoints/metadata_000009043968.json +1 -0
  38. checkpoints/metadata_000009961472.json +1 -0
  39. checkpoints/metadata_000010944512.json +1 -0
  40. checkpoints/metadata_000012058624.json +1 -0
  41. checkpoints/metadata_000013271040.json +1 -0
  42. checkpoints/metadata_000014581760.json +1 -0
  43. checkpoints/metadata_000016056320.json +1 -0
  44. checkpoints/metadata_000016384000.json +1 -0
  45. checkpoints/metadata_000017661952.json +1 -0
  46. checkpoints/metadata_000019431424.json +1 -0
  47. checkpoints/metadata_000021364736.json +1 -0
  48. checkpoints/metadata_000023494656.json +1 -0
  49. checkpoints/metadata_000025853952.json +1 -0
  50. checkpoints/metadata_000028442624.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20260227_134232-xgg2g05z/run-xgg2g05z.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/metadata_000000032768.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.37802791595459}
checkpoints/metadata_000000327680.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377717663851968}
checkpoints/metadata_000000360448.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377598009414987}
checkpoints/metadata_000000425984.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.377289953169777}
checkpoints/metadata_000000458752.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.37713998607216}
checkpoints/metadata_000000491520.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.376938789992566}
checkpoints/metadata_000000557056.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.376426411036498}
checkpoints/metadata_000000622592.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.375824231378044}
checkpoints/metadata_000000688128.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.375048947032507}
checkpoints/metadata_000000753664.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.374101782438617}
checkpoints/metadata_000000819200.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.373075912404031}
checkpoints/metadata_000000917504.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.371158385457182}
checkpoints/metadata_000000983040.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.369674488318335}
checkpoints/metadata_000001114112.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.366430059073076}
checkpoints/metadata_000001212416.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.363905413956502}
checkpoints/metadata_000001343488.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.360284077764357}
checkpoints/metadata_000001474560.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.356239770337442}
checkpoints/metadata_000001605632.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.351873034624731}
checkpoints/metadata_000001769472.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.346144734261483}
checkpoints/metadata_000001966080.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.338452705876588}
checkpoints/metadata_000002162688.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.329902555050262}
checkpoints/metadata_000002359296.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.320310576133869}
checkpoints/metadata_000002621440.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.306776841127029}
checkpoints/metadata_000002883584.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.291280306903579}
checkpoints/metadata_000003178496.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.272834772368334}
checkpoints/metadata_000003473408.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.251895300224053}
checkpoints/metadata_000003833856.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.223635847809346}
checkpoints/metadata_000004227072.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.18756683950548}
checkpoints/metadata_000004653056.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.143479916836643}
checkpoints/metadata_000005111808.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.09093993966237}
checkpoints/metadata_000005603328.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.02597446004432}
checkpoints/metadata_000006193152.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.937735552300891}
checkpoints/metadata_000006782976.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.839895830130107}
checkpoints/metadata_000007471104.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.712325186414569}
checkpoints/metadata_000008224768.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.559793606155328}
checkpoints/metadata_000009043968.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.37848964639277}
checkpoints/metadata_000009961472.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.159768193631905}
checkpoints/metadata_000010944512.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.916276569890632}
checkpoints/metadata_000012058624.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.639004766891704}
checkpoints/metadata_000013271040.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.342905782277333}
checkpoints/metadata_000014581760.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.04564433660178}
checkpoints/metadata_000016056320.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.753923420557582}
checkpoints/metadata_000016384000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.69892857486997}
checkpoints/metadata_000017661952.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.474507673806345}
checkpoints/metadata_000019431424.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.193267811340626}
checkpoints/metadata_000021364736.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.930870263745759}
checkpoints/metadata_000023494656.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.663814045828802}
checkpoints/metadata_000025853952.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.410301177091514}
checkpoints/metadata_000028442624.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_dmodel", "n_layers": 2, "d_model": 64, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.198303353421292}