Cosmos-T2A-low / model_config.json
wop's picture
Upload folder using huggingface_hub
247b1b6 verified
{
"model_family": "Cosmos T2A-low",
"model_name": "Cosmos T2A-low",
"model_class_name": "CosmosT2_Accelerate_LLM",
"hf_repo_id": "wop/Cosmos-T2A-low",
"tokenizer_name": "Qwen/Qwen2.5-0.5B",
"dataset_name": "wop/minitron-dataset",
"dataset_split": "train",
"dataset_row_limit": 288350,
"rows_used": 288350,
"stream_dataset": true,
"shuffle_buffer_size": 2048,
"train_val_fraction": 0.1,
"seed": 42,
"block_size": 1028,
"max_len": 1028,
"d_model": 64,
"n_layers": 4,
"n_heads": 4,
"n_kv_heads": 1,
"d_ff": 256,
"rope_base": 10000,
"dropout": 0.05,
"use_engram": true,
"engram_every": 2,
"engram_buckets": 128,
"engram_dim": 16,
"engram_order": 3,
"epochs": 1,
"batch_size": 2,
"lr": 0.0003,
"weight_decay": 0.1,
"warmup_steps": 50,
"grad_clip": 1.0,
"log_every_steps": 5000,
"eval_every_steps": 5000,
"plot_every_epochs": 5000,
"val_max_batches": 10,
"early_stopping_patience": 2,
"tokenization_batch_size": 256,
"num_workers": 1,
"pin_memory": true,
"dataloader_prefetch_factor": 4,
"persistent_workers": true,
"use_kv_cache": true,
"train_steps_per_epoch": 129757,
"loss_tokens_seen": 46651194,
"samples_seen": 222956
}