# Module-level configuration constants.
#
# The code that consumes these values is not visible in this section, so the
# notes on meaning below are inferred from the names only and are marked as
# assumptions to confirm. Each setting is a plain module global.

# NOTE(review): presumably batch and sequence/context sizes -- confirm
# against the code that builds the input batches.
batch_size = 12
block_size = 1024

# NOTE(review): looks like a switch for bias terms in the model's layers --
# confirm against the model definition.
bias = False

# NOTE(review): presumably selects a real dataset over synthetic input --
# confirm against the data-loading code.
real_data = True

# NOTE(review): presumably the random-number seed -- confirm where it is applied.
seed = 1337

# Stored as strings; how (and whether) they are translated into framework
# device/dtype objects is decided by the consumer.
device = 'cuda'
dtype = 'bfloat16'

# NOTE: this name shadows the builtin compile() for the rest of the module.
# It is kept unchanged because other code may read or override the setting
# by this exact name.
compile = True

# NOTE(review): presumably toggles a profiling run -- confirm against caller.
profile = False

# NOTE(review): names suggest transformer model dimensions (layer count,
# attention-head count, embedding width) -- confirm against the model code.
n_layer = 12
n_head = 12
n_embd = 768