# training_config.yaml — added (+109 lines) via huggingface_hub upload
---
# ModernBERT-base LoRA Training Config for US Stablecoin Encoder
# ================================================================
# Purpose: Train SOTA encoder model for SIW SOW
# Data: 10K+ synthetic query triplets (independent of pruner data)
# Target: NDCG@10 > 0.70 (vs baseline ~0.65)

model:
  base_model: "answerdotai/ModernBERT-base"
  model_type: "modernbert"
  output_dim: 768
  max_seq_length: 512
  pooling: "mean"  # Mean pooling over token embeddings
  normalize: true  # L2 normalize embeddings

lora:
  enabled: true
  rank: 16
  alpha: 32
  dropout: 0.1
  target_modules:
    - "Wqkv"  # ModernBERT: Combined QKV projection
    - "Wo"  # ModernBERT: Output projection
  modules_to_save: []
  bias: "none"

training:
  output_dir: "/workspace/models/encoder_modernbert_lora"
  num_epochs: 5
  batch_size: 16
  gradient_accumulation_steps: 2
  effective_batch_size: 32  # batch_size * gradient_accumulation_steps

  learning_rate: 2.0e-4
  warmup_ratio: 0.1
  weight_decay: 0.01
  max_grad_norm: 1.0

  scheduler: "cosine"
  optimizer: "adamw"

  fp16: true
  gradient_checkpointing: true

  logging_steps: 25
  eval_steps: 100
  save_steps: 200
  save_total_limit: 3

  # NOTE(review): nesting under `training` assumed — original indentation
  # was lost in transit; confirm against the config loader's schema.
  early_stopping:
    enabled: true
    patience: 3
    metric: "ndcg@10"
    mode: "max"
    threshold: 0.75  # Stop early if NDCG > 0.75

loss:
  type: "multiple_negatives_ranking"  # InfoNCE
  temperature: 0.05
  use_in_batch_negatives: true

data:
  train_path: "/workspace/data/labels/encoder_triplets_fresh_v3.jsonl"
  corpus_path: "/workspace/data/corpus.json"

  val_split: 0.2
  shuffle: true
  seed: 42

  num_hard_negatives: 3

evaluation:
  metrics:
    - "ndcg@10"
    - "mrr@10"
    - "recall@100"
    - "precision@10"

  eval_corpus_path: "/workspace/data/corpus.json"
  eval_on_train_end: true

hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

huggingface:
  hub_model_id: "sugiv/modernbert-us-stablecoin-lora"
  push_to_hub: false  # Set to true only if NDCG > 0.70
  hub_token: null  # Will read from HF_TOKEN env var
  private: false

logging:
  log_dir: "/workspace/logs"
  log_file: "encoder_modernbert_training.log"
  tensorboard: true
  # NOTE(review): `wandb` nested under `logging` assumed from document flow;
  # confirm against the consumer.
  wandb:
    enabled: false
    project: "stablecoin-encoder"
    entity: null

metadata:
  project: "US Stablecoin Regulatory Intelligence"
  task: "Document Retrieval (Encoder)"
  purpose: "SIW SOW - Produce best encoder for stablecoin data"
  data_source: "10K+ synthetic queries from 38 regulatory PDFs"
  data_method: "Qwen API synthetic generation"
  baseline_model: "BAAI/bge-m3"
  target_improvement: "+5-10% NDCG@10"
  training_date: "2026-03-10"