{ "model_family": "Cosmos T2A-low", "model_name": "Cosmos T2A-low", "model_class_name": "CosmosT2_Accelerate_LLM", "hf_repo_id": "wop/Cosmos-T2A-low", "tokenizer_name": "Qwen/Qwen2.5-0.5B", "dataset_name": "wop/minitron-dataset", "dataset_split": "train", "dataset_row_limit": 288350, "rows_used": 288350, "stream_dataset": true, "shuffle_buffer_size": 2048, "train_val_fraction": 0.1, "seed": 42, "block_size": 1028, "max_len": 1028, "d_model": 64, "n_layers": 4, "n_heads": 4, "n_kv_heads": 1, "d_ff": 256, "rope_base": 10000, "dropout": 0.05, "use_engram": true, "engram_every": 2, "engram_buckets": 128, "engram_dim": 16, "engram_order": 3, "epochs": 1, "batch_size": 2, "lr": 0.0003, "weight_decay": 0.1, "warmup_steps": 50, "grad_clip": 1.0, "log_every_steps": 5000, "eval_every_steps": 5000, "plot_every_epochs": 5000, "val_max_batches": 10, "early_stopping_patience": 2, "tokenization_batch_size": 256, "num_workers": 1, "pin_memory": true, "dataloader_prefetch_factor": 4, "persistent_workers": true, "use_kv_cache": true, "train_steps_per_epoch": 129757, "loss_tokens_seen": 46651194, "samples_seen": 222956 }