---
# Training configuration for transcoder experiments on Pythia-410m.
# Reconstructed into block style: the original was collapsed onto one line,
# which is not parseable YAML (plain scalars cannot contain ": ").

# Dataset used for activation collection.
dataset:
  max_length: 128  # token truncation length per example
  name: monology/pile-uncopyrighted
  split: train

# Model whose activations are transcoded.
model:
  device: cuda
  name: EleutherAI/pythia-410m

# Transcoder training hyperparameters.
transcoding:
  batch_size: 512
  bias: true
  debug: false
  hidden_multiplier: 4  # hidden width = multiplier x input dim (presumably — confirm against trainer)
  layer_idx: 2  # which transformer layer to transcode
  learning_rate: 0.02
  model_type: Bilinear
  n_batches: 20
  n_batches_full: 3000
  optimizer_type: Muon