JorgeAV committed on
Commit
9825ad6
·
verified ·
1 Parent(s): 7fe69de

Phase 4 final results with training analysis

Browse files
Files changed (1) hide show
  1. results/phase4_smollm2.json +122 -3
results/phase4_smollm2.json CHANGED
@@ -2,16 +2,135 @@
2
  "run_name": "phase4_smollm2",
3
  "phase": "4",
4
  "decoder": "SmolLM2-135M-Instruct",
5
- "bridge": "LLaVA-1.5 MLP (768\u2192576\u2192576)",
6
  "backbone": "dinov3",
7
  "K": 3,
8
- "best_composite_score": 14.625,
9
  "stage1_epochs": 3,
10
  "stage2_epochs": 7,
 
11
  "bridge_lr": 0.001,
12
  "lm_lr": 2e-05,
13
  "core_lr": 5e-05,
14
  "label_smoothing": 0.1,
15
  "num_evidence_tokens": 8,
16
- "gen_weight": 2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
 
2
  "run_name": "phase4_smollm2",
3
  "phase": "4",
4
  "decoder": "SmolLM2-135M-Instruct",
5
+ "bridge": "LLaVA-1.5 MLP (768->576->576)",
6
  "backbone": "dinov3",
7
  "K": 3,
 
8
  "stage1_epochs": 3,
9
  "stage2_epochs": 7,
10
+ "total_epochs": 10,
11
  "bridge_lr": 0.001,
12
  "lm_lr": 2e-05,
13
  "core_lr": 5e-05,
14
  "label_smoothing": 0.1,
15
  "num_evidence_tokens": 8,
16
+ "gen_weight": 2.0,
17
+ "best_composite_score": 14.6,
18
+ "eval_history": [
19
+ {
20
+ "epoch": "S1E0",
21
+ "mc": 58.5,
22
+ "docvqa_anls": 0.0,
23
+ "chartqa_ra": 0.0,
24
+ "textvqa_vqa": 0.0,
25
+ "composite": 14.6
26
+ },
27
+ {
28
+ "epoch": "S1E1",
29
+ "mc": 58.0,
30
+ "docvqa_anls": 0.0,
31
+ "chartqa_ra": 0.0,
32
+ "textvqa_vqa": 0.0,
33
+ "composite": 14.5
34
+ },
35
+ {
36
+ "epoch": "S1E2",
37
+ "mc": 58.0,
38
+ "docvqa_anls": 0.0,
39
+ "chartqa_ra": 0.0,
40
+ "textvqa_vqa": 0.0,
41
+ "composite": 14.5
42
+ },
43
+ {
44
+ "epoch": "S2E0",
45
+ "mc": 58.0,
46
+ "docvqa_anls": 0.0,
47
+ "chartqa_ra": 0.0,
48
+ "textvqa_vqa": 0.0,
49
+ "composite": 14.5
50
+ },
51
+ {
52
+ "epoch": "S2E1",
53
+ "mc": 55.0,
54
+ "docvqa_anls": 0.0,
55
+ "chartqa_ra": 0.0,
56
+ "textvqa_vqa": 0.0,
57
+ "composite": 13.8
58
+ },
59
+ {
60
+ "epoch": "S2E2",
61
+ "mc": 56.0,
62
+ "docvqa_anls": 0.0,
63
+ "chartqa_ra": 0.0,
64
+ "textvqa_vqa": 0.0,
65
+ "composite": 14.0
66
+ },
67
+ {
68
+ "epoch": "S2E3",
69
+ "mc": 55.0,
70
+ "docvqa_anls": 0.0,
71
+ "chartqa_ra": 0.0,
72
+ "textvqa_vqa": 0.0,
73
+ "composite": 13.8
74
+ },
75
+ {
76
+ "epoch": "S2E4",
77
+ "mc": 55.0,
78
+ "docvqa_anls": 0.0,
79
+ "chartqa_ra": 0.0,
80
+ "textvqa_vqa": 0.0,
81
+ "composite": 13.8
82
+ },
83
+ {
84
+ "epoch": "S2E5",
85
+ "mc": 54.5,
86
+ "docvqa_anls": 0.0,
87
+ "chartqa_ra": 0.0,
88
+ "textvqa_vqa": 0.0,
89
+ "composite": 13.6
90
+ },
91
+ {
92
+ "epoch": "S2E6",
93
+ "mc": 55.0,
94
+ "docvqa_anls": 0.0,
95
+ "chartqa_ra": 0.0,
96
+ "textvqa_vqa": 0.0,
97
+ "composite": 13.8
98
+ }
99
+ ],
100
+ "gen_loss_progression": {
101
+ "chartqa": {
102
+ "S1E0_start": 4.98,
103
+ "S1E0_end": 3.58,
104
+ "S2E6_end": 3.39,
105
+ "improvement": "32% reduction"
106
+ },
107
+ "docvqa": {
108
+ "S1E0_start": 7.53,
109
+ "S1E0_end": 4.68,
110
+ "S2E6_end": 4.39,
111
+ "improvement": "42% reduction"
112
+ },
113
+ "textvqa": {
114
+ "S1E0_start": 10.33,
115
+ "S1E0_end": 4.39,
116
+ "S2E6_end": 3.77,
117
+ "improvement": "64% reduction"
118
+ }
119
+ },
120
+ "key_findings": [
121
+ "SmolLM2 pre-trained decoder generates real English text (vs degenerate repetition in Phase 3.x)",
122
+ "Gen losses dropped dramatically from Phase 3 starting points (e.g. TextVQA 110->3.77)",
123
+ "Predictions are diverse, topically relevant, and read OCR/visual content",
124
+ "BUT: outputs are too verbose (paragraphs) for exact-match VQA metrics",
125
+ "Formal metrics (ANLS, relaxed_accuracy, VQA accuracy) remain at 0%",
126
+ "The bridge successfully projects JEPA 768d -> SmolLM2 576d space",
127
+ "MC accuracy shows mild forgetting during training: 58.5% (S1E0) -> 54.5% low (S2E5), ending at 55.0% (S2E6)"
128
+ ],
129
+ "next_steps": [
130
+ "Phase 4.1: Constrained decoding - max 5-10 tokens, greedy, first-word extraction",
131
+ "Phase 4.1: Answer-focused prompt: \"Answer in 1-3 words:\" instead of generic \"Answer:\"",
132
+ "Phase 4.1: Post-processing pipeline to extract concise answers from verbose outputs",
133
+ "Phase 4.2: LoRA on SmolLM2 (instead of full fine-tune) to preserve base capabilities",
134
+ "Phase 4.2: Freeze JEPA completely during Stage 2 to prevent MC degradation"
135
+ ]
136
  }