# train_layerwise.yml — M3: Layerwise attention + mean pooling, no centering
#
# Experiment configuration split into three stanzas: `data` (dataset and
# tokenization), `model` (encoder and pooling head), and `train` (optimization,
# hardware, and logging).
#
# NOTE(review): the nesting below was reconstructed from key order; confirm
# against the config loader that each key lives under the stanza it expects.
---
project_name: "deep-stylometry"

data:
  ds_name: "halvest"
  batch_size: 64
  tokenizer_name: "answerdotai/ModernBERT-base"
  max_length: 512
  padding: "do_not_pad"
  truncation: "longest_first"
  add_special_tokens: true
  map_batch_size: 1000
  load_from_cache_file: true
  subsets: ["base-2", "base-4", "base-6", "base-8", "base-10"]
  shuffle: true

model:
  # Same checkpoint name as data.tokenizer_name; keep the two in sync.
  base_checkpoint: "answerdotai/ModernBERT-base"
  dropout: 0.1
  expansion_ratio: 4
  # "layerwise" + mean_center: false is the M3 variant named in the header.
  pooling_method: "layerwise"
  skip_list: false
  mean_center: false

train:
  loss: "info_nce"
  tau: 0.5
  # NOTE(review): margin is set alongside loss "info_nce"; confirm whether
  # that loss reads it or whether it only applies to other loss choices.
  margin: 0.32
  # Keep the decimal point and signed exponent: YAML 1.1 loaders such as
  # PyYAML resolve `3.0e-5` as a float but would read `3e-5` as a string.
  lr: 3.0e-5
  weight_decay: 0.1
  device: "gpu"
  num_devices: 4
  strategy: "ddp_find_unused_parameters_true"
  process_group_backend: "nccl"
  max_epochs: 1
  # Explicit nulls: presumably the trainer falls back to its own defaults
  # for these — TODO confirm against the consumer.
  val_check_interval: null
  check_val_every_n_epoch: null
  log_every_n_steps: 1
  accumulate_grad_batches: 1
  gradient_clip_val: null
  precision: "16-mixed"
  # NOTE(review): logging keys are placed under `train` based on key order;
  # verify the loader does not expect them elsewhere.
  use_wandb: true
  log_model: false
  watch: "all"