---
# Pretraining/continued-pretraining config (litgpt-style CLI schema).
# NOTE(review): this file had lost all newlines/indentation and was not
# parseable YAML; structure below is reconstructed from the key names
# (class_path/init_args pairs, list fragments) — confirm nesting against
# the consuming tool's schema.

model_name: Llama-3.2-1B
out_dir: /home/aiops/zhuty/litgpt_out/pretrain/llama3-1b-proweb-50ksteps-diffusion
precision: bf16-mixed
initial_checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B
# auto: resume from the latest checkpoint in out_dir if one exists
resume: auto

data:
  class_path: litgpt.data.TextFiles
  init_args:
    train_data_path: /home/aiops/zhuty/cont_data/proweb/train
    val_data_path: /home/aiops/zhuty/cont_data/proweb/test
    seed: 42
    num_workers: 0
    add_eos: true

train:
  save_interval: 2500
  save_optimizer_state: true
  max_optimizer_state: 1
  log_interval: 1
  # effective batch = global_batch_size; micro_batch_size per device step
  global_batch_size: 1024
  micro_batch_size: 8
  # warmup as a fraction of max_steps (1% of 50000 = 500 steps)
  lr_warmup_fraction: 0.01
  max_steps: 50000
  max_seq_length: 1024
  # gradient-clipping norm
  max_norm: 1.0
  min_lr: 5.0e-06

eval:
  interval: 1000
  max_iters: 100
  initial_validation: true
  final_validation: true
  evaluate_example: first
  num_generation_examples: 1
  calculate_exact_match: false

log:
  project: mathcont

optimizer:
  class_path: torch.optim.AdamW
  init_args:
    lr: 5.0e-05
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95

devices: auto
num_nodes: 1
tokenizer_dir: checkpoints/meta-llama/Llama-3.2-1B
logger_name: wandb
seed: 42
compiler: torch
executors:
  - sdpa
  - torchcompile
  - torch
strategy: fsdp

# Diffusion-LM specific options (presumably a fork extension — verify
# against the consumer; not part of upstream litgpt).
diffusion: true
mask_token_id: 811
sampling_eps: 0.001
intradoc: false
block_diffusion: false
block_size: 8
timestep_sampler: uniform