---
# train_layerwise.yml — M3: Layerwise attention + mean pooling, no centering
#
# Training configuration split into three sections: `data`, `model` and
# `train`, plus the top-level project name.

project_name: "deep-stylometry"

# Dataset selection and tokenization settings.
data:
  ds_name: "halvest"
  batch_size: 64
  # Same checkpoint id as model.base_checkpoint below.
  tokenizer_name: "answerdotai/ModernBERT-base"
  max_length: 512
  # NOTE(review): "do_not_pad" / "longest_first" look like Hugging Face
  # tokenizer strategy names — confirm they are forwarded unchanged.
  padding: "do_not_pad"
  truncation: "longest_first"
  add_special_tokens: true
  map_batch_size: 1000
  load_from_cache_file: true
  subsets: ["base-2", "base-4", "base-6", "base-8", "base-10"]
  shuffle: true

# Encoder checkpoint and pooling head settings.
model:
  base_checkpoint: "answerdotai/ModernBERT-base"
  dropout: 0.1
  expansion_ratio: 4
  pooling_method: "layerwise"
  skip_list: false
  # "no centering" in the header comment corresponds to this flag.
  mean_center: false

# Optimisation, hardware and logging settings.
train:
  loss: "info_nce"
  tau: 0.5
  # NOTE(review): a margin is set although loss is "info_nce" — confirm
  # whether this value is read for this loss.
  margin: 0.32
  # Keep the decimal point and signed exponent: some YAML 1.1 loaders read
  # a bare `3e-5` as a string rather than a float.
  lr: 3.0e-5
  weight_decay: 0.1
  device: "gpu"
  num_devices: 4
  strategy: "ddp_find_unused_parameters_true"
  process_group_backend: "nccl"
  max_epochs: 1
  # NOTE(review): both validation-schedule keys are explicitly null —
  # confirm how the trainer interprets that combination.
  val_check_interval: null
  check_val_every_n_epoch: null
  log_every_n_steps: 1
  accumulate_grad_batches: 1
  gradient_clip_val: null
  precision: "16-mixed"
  use_wandb: true
  log_model: false
  watch: "all"