JorgeAV
/

MR-JEPA

@@ -2,16 +2,135 @@
   "run_name": "phase4_smollm2",
   "phase": "4",
   "decoder": "SmolLM2-135M-Instruct",
-  "bridge": "LLaVA-1.5 MLP (768\u2192576\u2192576)",
   "backbone": "dinov3",
   "K": 3,
-  "best_composite_score": 14.625,
   "stage1_epochs": 3,
   "stage2_epochs": 7,
   "bridge_lr": 0.001,
   "lm_lr": 2e-05,
   "core_lr": 5e-05,
   "label_smoothing": 0.1,
   "num_evidence_tokens": 8,
-  "gen_weight": 2.0
 }

   "run_name": "phase4_smollm2",
   "phase": "4",
   "decoder": "SmolLM2-135M-Instruct",
+  "bridge": "LLaVA-1.5 MLP (768->576->576)",
   "backbone": "dinov3",
   "K": 3,
   "stage1_epochs": 3,
   "stage2_epochs": 7,
+  "total_epochs": 10,
   "bridge_lr": 0.001,
   "lm_lr": 2e-05,
   "core_lr": 5e-05,
   "label_smoothing": 0.1,
   "num_evidence_tokens": 8,
+  "gen_weight": 2.0,
+  "best_composite_score": 14.6,
+  "eval_history": [
+    {
+      "epoch": "S1E0",
+      "mc": 58.5,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 14.6
+    },
+    {
+      "epoch": "S1E1",
+      "mc": 58.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 14.5
+    },
+    {
+      "epoch": "S1E2",
+      "mc": 58.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 14.5
+    },
+    {
+      "epoch": "S2E0",
+      "mc": 58.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 14.5
+    },
+    {
+      "epoch": "S2E1",
+      "mc": 55.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 13.8
+    },
+    {
+      "epoch": "S2E2",
+      "mc": 56.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 14.0
+    },
+    {
+      "epoch": "S2E3",
+      "mc": 55.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 13.8
+    },
+    {
+      "epoch": "S2E4",
+      "mc": 55.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 13.8
+    },
+    {
+      "epoch": "S2E5",
+      "mc": 54.5,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 13.6
+    },
+    {
+      "epoch": "S2E6",
+      "mc": 55.0,
+      "docvqa_anls": 0.0,
+      "chartqa_ra": 0.0,
+      "textvqa_vqa": 0.0,
+      "composite": 13.8
+    }
+  ],
+  "gen_loss_progression": {
+    "chartqa": {
+      "S1E0_start": 4.98,
+      "S1E0_end": 3.58,
+      "S2E6_end": 3.39,
+      "improvement": "32% reduction"
+    },
+    "docvqa": {
+      "S1E0_start": 7.53,
+      "S1E0_end": 4.68,
+      "S2E6_end": 4.39,
+      "improvement": "42% reduction"
+    },
+    "textvqa": {
+      "S1E0_start": 10.33,
+      "S1E0_end": 4.39,
+      "S2E6_end": 3.77,
+      "improvement": "64% reduction"
+    }
+  },
+  "key_findings": [
+    "SmolLM2 pre-trained decoder generates real English text (vs degenerate repetition in Phase 3.x)",
+    "Gen losses dropped dramatically from Phase 3 starting points (e.g. TextVQA 110->3.77)",
+    "Predictions are diverse and topically relevant, and they read OCR/visual content",
+    "BUT: outputs are too verbose (paragraphs) for exact-match VQA metrics",
+    "Formal metrics (ANLS, relaxed_accuracy, VQA accuracy) remain at 0%",
+    "The bridge successfully projects JEPA 768d -> SmolLM2 576d space",
+    "MC accuracy shows mild forgetting: 58.5% (S1E0) -> 55.0% (S2E6), with a low of 54.5% at S2E5"
+  ],
+  "next_steps": [
+    "Phase 4.1: Constrained decoding - max 5-10 tokens, greedy, first-word extraction",
+    "Phase 4.1: Answer-focused prompt: \"Answer in 1-3 words:\" instead of generic \"Answer:\"",
+    "Phase 4.1: Post-processing pipeline to extract concise answers from verbose outputs",
+    "Phase 4.2: LoRA on SmolLM2 (instead of full fine-tune) to preserve base capabilities",
+    "Phase 4.2: Freeze JEPA completely during Stage 2 to prevent MC degradation"
+  ]
 }