geolip-bert-8192 / config.json
AbstractPhil's picture
Upload config.json with huggingface_hub
ddc9d55 verified
{
"model": {
"bert_model": "google-bert/bert-large-uncased",
"hidden_size": 1024,
"freeze_bert": true,
"n_memory_tokens": 16,
"bank_size": 128,
"anchor_dim": 1024,
"n_bank_heads": 8,
"bank_cross_layers": 2,
"gate_type": "gru",
"extract_layers": [
2,
5,
8,
11,
14,
17,
20,
23
],
"layer_fusion": "learned",
"max_content_tokens": 480,
"segment_overlap": 64,
"max_position": 512,
"n_teachers": 2,
"teacher_hidden": 1024,
"cv_target": 0.2
},
"training": {
"max_documents": 50000,
"max_val_documents": 500,
"segment_length": 480,
"segment_overlap": 64,
"target_chain_segments": 16,
"max_segments": 16,
"min_segments": 6,
"modern_bert_model": "answerdotai/ModernBERT-large",
"longformer_model": "allenai/longformer-large-4096",
"modern_max_len": 8192,
"longformer_max_len": 4096,
"procrustes_n_samples": 500,
"epochs": 10,
"batch_size": 4,
"lr_bank": 0.002,
"lr_output": 0.0005,
"lr_proj": 0.001,
"min_lr": 1e-06,
"weight_decay": 0.01,
"grad_clip": 1.0,
"warmup_steps": 300,
"tbptt_segments": 0,
"modern_weight": 1.0,
"longformer_weight": 0.5,
"cv_weight": 0.05,
"temperature": 0.07,
"checkpoint_dir": "/home/claude/deep_bert_v3_checkpoints",
"tensorboard_dir": "/home/claude/deep_bert_v3_tb",
"log_every": 20,
"eval_every": 200,
"save_every_epoch": true
}
}