| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "model_summary", "total_params": 38010240, "trainable_params": 38010240, "weight_tied_lm_head": true, "timestamp": "2026-05-01T22:46:16.950254"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "config", "model": {"vocab_size": 50304, "n_layers": 20, "n_heads": 6, "n_kv_heads": 2, "n_embd": 384, "embedding_dim": 128, "tie_embeddings": true, "context_len": 1024, "dropout": 0.0, "bias": false, "norm_type": "rmsnorm", "norm_eps": 1e-05, "positional_embedding": "rope", "rope_theta": 10000.0, "rope_fraction": 1.0, "mlp_type": "swiglu", "mlp_hidden_mult": 4.0, "mlp_hidden_dim": 1024, "qk_norm": false, "block_style": "sequential"}, "training": {"seed": 0, "learning_rate": 0.0006, "min_lr": 6e-05, "weight_decay": 0.03, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "max_iters": 200000, "warmup_steps": 14, "lr_schedule": "wsd", "wsd_stable_frac": 0.85, "batch_size": 8, "gradient_accumulation_steps": 32, "dtype": "float16", "device": "cuda", "eval_step_interval": 500, "eval_batches": 20, "log_interval": 10, "max_checkpoints": 5}, "distributed": {"enabled": true, "backend": "nccl"}, "timestamp": "2026-05-01T22:46:16.950715"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "checkpoint_saved", "step": 500, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/ckpt_step0000500.pt", "timestamp": "2026-05-01T23:13:29.361604"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 500, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:13:30.166388"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "checkpoint_saved", "step": 1000, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/ckpt_step0001000.pt", "timestamp": "2026-05-01T23:35:46.073076"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 1000, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:35:46.995061"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "checkpoint_saved", "step": 1500, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/ckpt_step0001500.pt", "timestamp": "2026-05-01T23:57:59.915111"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 1500, "path": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:58:00.792849"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "model_summary", "total_params": 38010240, "trainable_params": 38010240, "weight_tied_lm_head": true, "timestamp": "2026-05-02T00:19:16.981945"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "config", "model": {"vocab_size": 50304, "n_layers": 20, "n_heads": 6, "n_kv_heads": 2, "n_embd": 384, "embedding_dim": 128, "tie_embeddings": true, "context_len": 1024, "dropout": 0.0, "bias": false, "norm_type": "rmsnorm", "norm_eps": 1e-05, "positional_embedding": "rope", "rope_theta": 10000.0, "rope_fraction": 1.0, "mlp_type": "swiglu", "mlp_hidden_mult": 4.0, "mlp_hidden_dim": 1024, "qk_norm": false, "block_style": "sequential"}, "training": {"seed": 0, "learning_rate": 0.0006, "min_lr": 6e-05, "weight_decay": 0.03, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "max_iters": 200000, "warmup_steps": 14, "lr_schedule": "wsd", "wsd_stable_frac": 0.85, "batch_size": 8, "gradient_accumulation_steps": 8, "dtype": "float16", "device": "cuda", "eval_step_interval": 500, "eval_batches": 20, "log_interval": 10, "max_checkpoints": 5}, "distributed": {"enabled": true, "backend": "nccl"}, "timestamp": "2026-05-02T00:19:16.982286"} |
| {"run_name": "final_c4_20l384_factorized", "stage": "pretraining", "event": "resume", "checkpoint": "artifacts/full_run/final_c4_20l384_factorized/checkpoints/ckpt_step0001500.pt", "step": 1500, "best_val_loss": 4.2855634689331055, "timestamp": "2026-05-02T00:19:19.323639"} |
|
|