JorgeAV
/

MR-JEPA

@@ -2,90 +2,16 @@
   "run_name": "phase4_smollm2",
   "phase": "4",
   "decoder": "SmolLM2-135M-Instruct",
-  "bridge": "LLaVA-1.5 MLP (768->576->576)",
   "backbone": "dinov3",
   "K": 3,
   "stage1_epochs": 3,
-  "stage2_epochs_completed": 3,
-  "stage2_epochs_target": 7,
   "bridge_lr": 0.001,
   "lm_lr": 2e-05,
   "core_lr": 5e-05,
   "label_smoothing": 0.1,
   "num_evidence_tokens": 8,
-  "gen_weight": 2.0,
-  "eval_history": [
-    {
-      "epoch": "S1E0",
-      "mc": 58.5,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 14.6
-    },
-    {
-      "epoch": "S1E1",
-      "mc": 58.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 14.5
-    },
-    {
-      "epoch": "S1E2",
-      "mc": 58.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 14.5
-    },
-    {
-      "epoch": "S2E0",
-      "mc": 58.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 14.5
-    },
-    {
-      "epoch": "S2E1",
-      "mc": 55.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 13.8
-    },
-    {
-      "epoch": "S2E2",
-      "mc": 56.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 14.0
-    },
-    {
-      "epoch": "S2E3",
-      "mc": 55.0,
-      "docvqa_anls": 0.0,
-      "chartqa_ra": 0.0,
-      "textvqa_vqa": 0.0,
-      "composite": 13.8
-    }
-  ],
-  "key_findings": [
-    "SmolLM2 pre-trained decoder generates real English text (vs degenerate repetition in Phase 3.x)",
-    "Gen losses dropped dramatically: ChartQA 4.98->3.40, DocVQA 7.53->4.52, TextVQA 10.33->3.97",
-    "Predictions are diverse and topically relevant but too verbose for exact-match metrics",
-    "Bridge MLP successfully projects JEPA 768d -> SmolLM2 576d space",
-    "MC accuracy holds at 55-58% (slight catastrophic forgetting during Stage 2)",
-    "Next steps: post-processing, shorter generation, greedy decoding"
-  ],
-  "improvements_over_phase3": [
-    "Replaced random-init 4-layer decoder (7M params) with SmolLM2-135M (135M params)",
-    "LLaVA-1.5 two-layer MLP bridge for nonlinear alignment",
-    "Label smoothing (0.1) for smoother gradient signal",
-    "Nucleus sampling + repetition penalty in eval",
-    "Two-stage training: bridge-only then full fine-tuning",
-    "SmolLM2 tokenizer (49K vocab) vs Qwen3 (152K vocab)"
-  ]
 }

   "run_name": "phase4_smollm2",
   "phase": "4",
   "decoder": "SmolLM2-135M-Instruct",
+  "bridge": "LLaVA-1.5 MLP (768\u2192576\u2192576)",
   "backbone": "dinov3",
   "K": 3,
+  "best_composite_score": 14.625,
   "stage1_epochs": 3,
+  "stage2_epochs": 7,
   "bridge_lr": 0.001,
   "lm_lr": 2e-05,
   "core_lr": 5e-05,
   "label_smoothing": 0.1,
   "num_evidence_tokens": 8,
+  "gen_weight": 2.0
 }