{
  "run": {
    "name": "final_c6_18l448_factorized_aggressive",
    "artifacts_root": "artifacts/final_c6",
    "resume": false,
    "deterministic": false
  },
  "distributed": {
    "enabled": false,
    "backend": "nccl"
  },
  "preprocessing": {
    "data_dir": "data",
    "processed_dir": "data/processed_OWT",
    "log_dir": "logs/preprocessing",
    "train_split": 0.9,
    "dataset_name": "openwebtext",
    "dataset_config_name": null,
    "dataset_split": "train",
    "dataset_text_column": "text",
    "dataset_repo_id": "huiting123/processedOWT",
    "num_proc": 4,
    "tokenization_num_proc": 0,
    "tokenization_batch_size": 1000,
    "tokenization_chunk_size": 100000,
    "shard_write_batch_size": 5000,
    "seed": 42,
    "subset_size": 0,
    "raw_data_path": null,
    "test_data_path": null,
    "skip_language_filter": false,
    "skip_repetition_filter": false,
    "skip_quality_filter": false,
    "min_words": 100,
    "max_words": 10000,
    "max_non_ascii": 0.3,
    "min_line_uniqueness": 0.7,
    "min_sentence_uniqueness": 0.8,
    "max_train_tokens": 0
  },
  "model": {
    "vocab_size": 50304,
    "n_layers": 18,
    "n_heads": 7,
    "n_kv_heads": 1,
    "n_embd": 448,
    "embedding_dim": 128,
    "tie_embeddings": true,
    "context_len": 1024,
    "dropout": 0.0,
    "bias": false,
    "norm_type": "rmsnorm",
    "norm_eps": 1e-05,
    "positional_embedding": "rope",
    "rope_theta": 10000.0,
    "rope_fraction": 1.0,
    "mlp_type": "swiglu",
    "mlp_hidden_mult": 4.0,
    "mlp_hidden_dim": 1024,
    "qk_norm": false,
    "block_style": "sequential"
  },
  "training": {
    "seed": 0,
    "learning_rate": 0.00056,
    "min_lr": 5.6e-05,
    "weight_decay": 0.03,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "max_iters": 92686,
    "warmup_steps": 927,
    "lr_schedule": "wsd",
    "wsd_stable_frac": 0.85,
    "batch_size": 4,
    "gradient_accumulation_steps": 16,
    "dtype": "float16",
    "device": "cuda",
    "eval_step_interval": 500,
    "eval_batches": 20,
    "log_interval": 10,
    "max_checkpoints": 5
  },
  "inference": {
    "checkpoint": null,
    "prompt": "",
    "max_tokens": 100,
    "temperature": 1.0,
    "seed": 0,
    "device": "auto",
    "leaderboard": false
  },
  "post_training": {
    "base_checkpoint": null,
    "learning_rate": 1e-05,
    "max_iters": 1000,
    "checkpoint_dir": "checkpoints/post",
    "log_dir": "logs/post"
  },
  "evaluation": {
    "checkpoint": null,
    "batch_size": 4,
    "device": "auto",
    "log_dir": "logs/evaluation"
  },
  "notifications": {
    "enabled": false,
    "smtp_host": "smtp.gmail.com",
    "smtp_port": 587,
    "smtp_user": "",
    "to_addresses": [],
    "cooldown_minutes": 5,
    "periodic_status_hours": 4.0,
    "disk_min_gb": 5.0
  }
}