fullrun / final /events.jsonl
huiting tang
Upload folder using huggingface_hub
ba9bb9a verified
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "model_summary", "total_params": 38227520, "trainable_params": 38227520, "weight_tied_lm_head": true, "timestamp": "2026-05-01T22:50:51.437763"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "config", "model": {"vocab_size": 50304, "n_layers": 18, "n_heads": 5, "n_kv_heads": 1, "n_embd": 320, "embedding_dim": null, "tie_embeddings": true, "context_len": 1024, "dropout": 0.0, "bias": false, "norm_type": "rmsnorm", "norm_eps": 1e-05, "positional_embedding": "rope", "rope_theta": 10000.0, "rope_fraction": 1.0, "mlp_type": "swiglu", "mlp_hidden_mult": 4.0, "mlp_hidden_dim": 1024, "qk_norm": false, "block_style": "sequential"}, "training": {"seed": 0, "learning_rate": 0.00066, "min_lr": 6.6e-05, "weight_decay": 0.03, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "max_iters": 200000, "warmup_steps": 15, "lr_schedule": "wsd", "wsd_stable_frac": 0.85, "batch_size": 8, "gradient_accumulation_steps": 32, "dtype": "float16", "device": "cuda", "eval_step_interval": 500, "eval_batches": 20, "log_interval": 10, "max_checkpoints": 5}, "distributed": {"enabled": true, "backend": "nccl"}, "timestamp": "2026-05-01T22:50:51.438015"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "checkpoint_saved", "step": 500, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0000500.pt", "timestamp": "2026-05-01T23:11:50.932143"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 500, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:11:52.239667"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "checkpoint_saved", "step": 1000, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0001000.pt", "timestamp": "2026-05-01T23:32:02.856076"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 1000, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:32:03.756367"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "checkpoint_saved", "step": 1500, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0001500.pt", "timestamp": "2026-05-01T23:52:13.760386"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 1500, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/best_ckpt.pt", "timestamp": "2026-05-01T23:52:14.707611"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "model_summary", "total_params": 38227520, "trainable_params": 38227520, "weight_tied_lm_head": true, "timestamp": "2026-05-02T12:58:34.169820"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "config", "model": {"vocab_size": 50304, "n_layers": 18, "n_heads": 5, "n_kv_heads": 1, "n_embd": 320, "embedding_dim": null, "tie_embeddings": true, "context_len": 1024, "dropout": 0.0, "bias": false, "norm_type": "rmsnorm", "norm_eps": 1e-05, "positional_embedding": "rope", "rope_theta": 10000.0, "rope_fraction": 1.0, "mlp_type": "swiglu", "mlp_hidden_mult": 4.0, "mlp_hidden_dim": 1024, "qk_norm": false, "block_style": "sequential"}, "training": {"seed": 0, "learning_rate": 0.00066, "min_lr": 6.6e-05, "weight_decay": 0.03, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "max_iters": 11586, "warmup_steps": 116, "lr_schedule": "wsd", "wsd_stable_frac": 0.85, "batch_size": 4, "gradient_accumulation_steps": 16, "dtype": "float16", "device": "cuda", "eval_step_interval": 1000, "eval_batches": 20, "log_interval": 10, "max_checkpoints": 5}, "distributed": {"enabled": true, "backend": "nccl"}, "timestamp": "2026-05-02T12:58:34.170168"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "resume", "checkpoint": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0001500.pt", "step": 1500, "best_val_loss": 4.013287210464478, "timestamp": "2026-05-02T12:58:34.653674"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "checkpoint_saved", "step": 2000, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0002000.pt", "timestamp": "2026-05-02T13:09:31.598847"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "checkpoint_saved", "step": 3000, "path": "artifacts/full_run/final_c2_18l320_standard/checkpoints/ckpt_step0003000.pt", "timestamp": "2026-05-02T13:32:36.344427"}