{
  "model": {
    "bert_model": "google-bert/bert-large-uncased",
    "hidden_size": 1024,
    "freeze_bert": true,
    "n_memory_tokens": 16,
    "bank_size": 128,
    "anchor_dim": 1024,
    "n_bank_heads": 8,
    "bank_cross_layers": 2,
    "gate_type": "gru",
    "extract_layers": [2, 5, 8, 11, 14, 17, 20, 23],
    "layer_fusion": "learned",
    "max_content_tokens": 480,
    "segment_overlap": 64,
    "max_position": 512,
    "n_teachers": 2,
    "teacher_hidden": 1024,
    "cv_target": 0.2
  },
  "training": {
    "max_documents": 50000,
    "max_val_documents": 500,
    "segment_length": 480,
    "segment_overlap": 64,
    "target_chain_segments": 16,
    "max_segments": 16,
    "min_segments": 6,
    "modern_bert_model": "answerdotai/ModernBERT-large",
    "longformer_model": "allenai/longformer-large-4096",
    "modern_max_len": 8192,
    "longformer_max_len": 4096,
    "procrustes_n_samples": 500,
    "epochs": 10,
    "batch_size": 4,
    "lr_bank": 0.002,
    "lr_output": 0.0005,
    "lr_proj": 0.001,
    "min_lr": 1e-06,
    "weight_decay": 0.01,
    "grad_clip": 1.0,
    "warmup_steps": 300,
    "tbptt_segments": 0,
    "modern_weight": 1.0,
    "longformer_weight": 0.5,
    "cv_weight": 0.05,
    "temperature": 0.07,
    "checkpoint_dir": "/home/claude/deep_bert_v3_checkpoints",
    "tensorboard_dir": "/home/claude/deep_bert_v3_tb",
    "log_every": 20,
    "eval_every": 200,
    "save_every_epoch": true
  }
}