{
  "output_dir": "runs/chembl36_small_mask_mlm_lr_sweep/mask_standard__mlm_0p15__lr_4e-4",
  "tokenizer_vocab_path": "tokenizer/chembl36_selfies_2m_ape_max2_min3000.json",
  "tokenizer_metadata_path": "tokenizer/chembl36_selfies_2m_ape_max2_min3000.metadata.json",
  "dataset_name": "data/pretrain/chembl36_selfies",
  "selfies_column": "selfies",
  "train_split": "train",
  "validation_split": "valid",
  "use_validation_split": true,
  "data_dir": null,
  "data_files": null,
  "eval_size": 4096,
  "shuffle_buffer_size": 100000,
  "seed": 42,
  "val_split_mod": 100,
  "val_split_bucket": 0,
  "tokenizer_validation_samples": 1000,
  "unk_rate_threshold": 0.001,
  "truncation_warn_threshold": 0.05,
  "model_size": "small",
  "max_seq_length": 128,
  "mlm_probability": 0.15,
  "masking_strategy": "standard",
  "span_p_geom": 0.4,
  "span_max_length": 6,
  "heteroatom_start_weight": 2.0,
  "max_steps": 30000,
  "per_device_train_batch_size": 256,
  "per_device_eval_batch_size": 256,
  "gradient_accumulation_steps": 1,
  "learning_rate": 0.0004,
  "weight_decay": 0.01,
  "warmup_steps": 1500,
  "max_grad_norm": 1.0,
  "load_best_model_at_end": true,
  "metric_for_best_model": "eval_loss",
  "greater_is_better": false,
  "logging_steps": 100,
  "eval_steps": 5000,
  "save_steps": 5000,
  "save_total_limit": 2,
  "device_backend": "cuda",
  "bf16": true,
  "fp16": false,
  "num_workers": 4,
  "max_eval_batches": 16,
  "report_to": "tensorboard",
  "compute_masked_accuracy": true,
  "debug": false,
  "hf_login": false
}