{
  "dataset_name": "HuggingFaceFW/fineweb-edu-score-2",
  "dataset_config": null,
  "split": "train",
  "text_column": "text",
  "streaming": true,
  "download_rows": null,
  "shuffle_buffer": 50000,
  "preprocessing_batch_size": 128,
  "iterable_shards_when_downloaded": 1024,
  "tokenizer_name": "gpt2",
  "block_size": 2048,
  "model_preset": "tiny_125m",
  "n_layer": null,
  "n_embd": null,
  "n_head": null,
  "resid_pdrop": 0.0,
  "embd_pdrop": 0.0,
  "attn_pdrop": 0.0,
  "gradient_checkpointing": false,
  "max_parameters": 600000000,
  "num_tpu_processes": 1,
  "per_device_batch_size": 8,
  "gradient_accumulation_steps": 4,
  "max_steps": 10000,
  "learning_rate": 0.0003,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "warmup_steps": 100,
  "max_grad_norm": 1.0,
  "num_workers": 0,
  "seed": 42,
  "log_every": 20,
  "save_every": 100,
  "output_dir": "/kaggle/working/tiny-lm-tpu",
  "resume_from": null,
  "push_to_hub": true,
  "hub_model_id": "moos124/tiny-lm-125m"
}