{ "d_model": 384, "n_layers": 8, "n_heads": 8, "n_kv_heads": 4, "d_ff": 1024, "K": 16, "max_len": 512, "entropy_reg": 0.02, "batch_size": 16, "seq_len": 256, "lr": 0.0003, "steps": 30000, "warmup_steps": 500, "val_every": 500, "save_every": 2500, "probe_steps": [ 1500, 5000, 15000, 30000 ], "vocab_size": 50257, "n_params": 37107592, "architecture": "CDM_V2_code", "device": "cuda", "dataset": "bigcode/starcoderdata:python" }