| { |
| "tokenizer": "HuggingFaceTB/SmolLM2-135M", |
| "dtype": "fp32", |
| "param_dtype": "bf16", |
| "num_train_epochs": 30, |
| "per_device_train_batch_size": 32, |
| "per_device_eval_batch_size": 32, |
| "gradient_accumulation_steps": 5, |
| "seed": 42, |
| "learning_rate": 0.0003, |
| "weight_decay": 0.01, |
| "adam_beta1": 0.9, |
| "adam_beta2": 0.999, |
| "warmup_ratio": 0.2, |
| "max_grad_norm": 1.0, |
| "logging_steps": 200, |
| "output_dir": "./artifacts/", |
| "logging_dir": "./artifacts/logs/", |
| "run_name": "train", |
| "best_metric_key": "perplexity", |
| "best_n_to_keep": 3, |
| "hub_model_id": "thiomajid/xLSTM-TPU", |
| "hub_private_repo": false, |
| "upload_message": "xLSTM is ready for text generation", |
| "train_dataset_url": "roneneldan/TinyStories", |
| "train_subset": null, |
| "train_split": "train", |
| "train_samples": 64000, |
| "eval_dataset_url": "roneneldan/TinyStories", |
| "eval_subset": null, |
| "eval_split": "validation", |
| "eval_samples": 3200, |
| "dataloader_drop_last": true, |
| "dataloader_num_workers": 4, |
| "worker_buffer_size": 2, |
| "text_column": "text", |
| "use_dataset_cache": true, |
| "dataset_cache_dir": "./.hf_data_cache", |
| "mesh_shape": [ |
| 8, |
| 1 |
| ], |
| "axis_names": [ |
| "dp", |
| "tp" |
| ] |
| } |