# training_config.yaml — added (+109 lines) via huggingface_hub upload
---
# ModernBERT-base LoRA Training Config for US Stablecoin Encoder
# ================================================================
# Purpose: Train SOTA encoder model for SIW SOW
# Data: 10K+ synthetic query triplets (independent of pruner data)
# Target: NDCG@10 > 0.70 (vs baseline ~0.65)

model:
  base_model: "answerdotai/ModernBERT-base"
  model_type: "modernbert"
  output_dim: 768
  max_seq_length: 512
  pooling: "mean"  # Mean pooling over token embeddings
  normalize: true  # L2 normalize embeddings

lora:
  enabled: true
  rank: 16
  alpha: 32
  dropout: 0.1
  target_modules:
    - "Wqkv"  # ModernBERT: Combined QKV projection
    - "Wo"  # ModernBERT: Output projection
  modules_to_save: []
  bias: "none"

training:
  output_dir: "/workspace/models/encoder_modernbert_lora"
  num_epochs: 5
  batch_size: 16
  gradient_accumulation_steps: 2
  effective_batch_size: 32  # batch_size * gradient_accumulation_steps

  learning_rate: 2.0e-4
  warmup_ratio: 0.1
  weight_decay: 0.01
  max_grad_norm: 1.0

  scheduler: "cosine"
  optimizer: "adamw"

  fp16: true
  gradient_checkpointing: true

  logging_steps: 25
  eval_steps: 100
  save_steps: 200
  save_total_limit: 3

  # NOTE(review): nesting under `training` assumed — original indentation
  # was lost in transit; confirm against the config loader's schema.
  early_stopping:
    enabled: true
    patience: 3
    metric: "ndcg@10"
    mode: "max"
    threshold: 0.75  # Stop early if NDCG > 0.75

loss:
  type: "multiple_negatives_ranking"  # InfoNCE
  temperature: 0.05
  use_in_batch_negatives: true

data:
  train_path: "/workspace/data/labels/encoder_triplets_fresh_v3.jsonl"
  corpus_path: "/workspace/data/corpus.json"

  val_split: 0.2
  shuffle: true
  seed: 42

  num_hard_negatives: 3

evaluation:
  metrics:
    - "ndcg@10"
    - "mrr@10"
    - "recall@100"
    - "precision@10"

  eval_corpus_path: "/workspace/data/corpus.json"
  eval_on_train_end: true

hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

huggingface:
  hub_model_id: "sugiv/modernbert-us-stablecoin-lora"
  push_to_hub: false  # Set to true only if NDCG > 0.70
  hub_token: null  # Will read from HF_TOKEN env var
  private: false

logging:
  log_dir: "/workspace/logs"
  log_file: "encoder_modernbert_training.log"
  tensorboard: true
  # NOTE(review): `wandb` nested under `logging` assumed from document flow;
  # confirm against the consumer.
  wandb:
    enabled: false
    project: "stablecoin-encoder"
    entity: null

metadata:
  project: "US Stablecoin Regulatory Intelligence"
  task: "Document Retrieval (Encoder)"
  purpose: "SIW SOW - Produce best encoder for stablecoin data"
  data_source: "10K+ synthetic queries from 38 regulatory PDFs"
  data_method: "Qwen API synthetic generation"
  baseline_model: "BAAI/bge-m3"
  target_improvement: "+5-10% NDCG@10"
  training_date: "2026-03-10"