xLSTM-TPU / trainer_config.json
thiomajid's picture
xLSTM is ready for image generation
669c9b0 verified
{
"tokenizer": "HuggingFaceTB/SmolLM2-135M",
"dtype": "fp32",
"param_dtype": "bf16",
"num_train_epochs": 30,
"per_device_train_batch_size": 32,
"per_device_eval_batch_size": 32,
"gradient_accumulation_steps": 5,
"seed": 42,
"learning_rate": 0.0003,
"weight_decay": 0.01,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"warmup_ratio": 0.2,
"max_grad_norm": 1.0,
"logging_steps": 200,
"output_dir": "./artifacts/",
"logging_dir": "./artifacts/logs/",
"run_name": "train",
"best_metric_key": "perplexity",
"best_n_to_keep": 3,
"hub_model_id": "thiomajid/xLSTM-TPU",
"hub_private_repo": false,
"upload_message": "xLSTM is ready for image generation",
"train_dataset_url": "roneneldan/TinyStories",
"train_subset": null,
"train_split": "train",
"train_samples": 64000,
"eval_dataset_url": "roneneldan/TinyStories",
"eval_subset": null,
"eval_split": "validation",
"eval_samples": 3200,
"dataloader_drop_last": true,
"dataloader_num_workers": 4,
"worker_buffer_size": 2,
"text_column": "text",
"use_dataset_cache": true,
"dataset_cache_dir": "./.hf_data_cache",
"mesh_shape": [
8,
1
],
"axis_names": [
"dp",
"tp"
]
}