sugiv committed on
Commit
45ade03
·
verified ·
1 Parent(s): 4e4d1c2

Upload training_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_config.yaml +109 -0
training_config.yaml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ModernBERT-base LoRA Training Config for US Stablecoin Encoder
2
+ # ================================================================
3
+ # Purpose: Train SOTA encoder model for SIW SOW
4
+ # Data: 10K+ synthetic query triplets (independent of pruner data)
5
+ # Target: NDCG@10 > 0.70 (vs baseline ~0.65)
6
+
7
+ model:
8
+ base_model: "answerdotai/ModernBERT-base"
9
+ model_type: "modernbert"
10
+ output_dim: 768
11
+ max_seq_length: 512
12
+ pooling: "mean" # Mean pooling over token embeddings
13
+ normalize: true # L2 normalize embeddings
14
+
15
+ lora:
16
+ enabled: true
17
+ rank: 16
18
+ alpha: 32
19
+ dropout: 0.1
20
+ target_modules:
21
+ - "Wqkv" # ModernBERT: Combined QKV projection
22
+ - "Wo" # ModernBERT: Output projection
23
+ modules_to_save: []
24
+ bias: "none"
25
+
26
+ training:
27
+ output_dir: "/workspace/models/encoder_modernbert_lora"
28
+ num_epochs: 5
29
+ batch_size: 16
30
+ gradient_accumulation_steps: 2
31
+ effective_batch_size: 32 # batch_size * gradient_accumulation_steps
32
+
33
+ learning_rate: 2.0e-4
34
+ warmup_ratio: 0.1
35
+ weight_decay: 0.01
36
+ max_grad_norm: 1.0
37
+
38
+ scheduler: "cosine"
39
+ optimizer: "adamw"
40
+
41
+ fp16: true
42
+ gradient_checkpointing: true
43
+
44
+ logging_steps: 25
45
+ eval_steps: 100
46
+ save_steps: 200
47
+ save_total_limit: 3
48
+
49
+ early_stopping:
50
+ enabled: true
51
+ patience: 3
52
+ metric: "ndcg@10"
53
+ mode: "max"
54
+ threshold: 0.75 # Stop early if NDCG > 0.75
55
+
56
+ loss:
57
+ type: "multiple_negatives_ranking" # InfoNCE
58
+ temperature: 0.05
59
+ use_in_batch_negatives: true
60
+
61
+ data:
62
+ train_path: "/workspace/data/labels/encoder_triplets_fresh_v3.jsonl"
63
+ corpus_path: "/workspace/data/corpus.json"
64
+
65
+ val_split: 0.2
66
+ shuffle: true
67
+ seed: 42
68
+
69
+ num_hard_negatives: 3
70
+
71
+ evaluation:
72
+ metrics:
73
+ - "ndcg@10"
74
+ - "mrr@10"
75
+ - "recall@100"
76
+ - "precision@10"
77
+
78
+ eval_corpus_path: "/workspace/data/corpus.json"
79
+ eval_on_train_end: true
80
+
81
+ hardware:
82
+ device: "cuda"
83
+ num_workers: 4
84
+ pin_memory: true
85
+
86
+ huggingface:
87
+ hub_model_id: "sugiv/modernbert-us-stablecoin-lora"
88
+ push_to_hub: false # Set to true only if NDCG > 0.70
89
+ hub_token: null # Will read from HF_TOKEN env var
90
+ private: false
91
+
92
+ logging:
93
+ log_dir: "/workspace/logs"
94
+ log_file: "encoder_modernbert_training.log"
95
+ tensorboard: true
96
+ wandb:
97
+ enabled: false
98
+ project: "stablecoin-encoder"
99
+ entity: null
100
+
101
+ metadata:
102
+ project: "US Stablecoin Regulatory Intelligence"
103
+ task: "Document Retrieval (Encoder)"
104
+ purpose: "SIW SOW - Produce best encoder for stablecoin data"
105
+ data_source: "10K+ synthetic queries from 38 regulatory PDFs"
106
+ data_method: "Qwen API synthetic generation"
107
+ baseline_model: "BAAI/bge-m3"
108
+ target_improvement: "+5-10% NDCG@10"
109
+ training_date: "2026-03-10"