euhidaman commited on
Commit
268efd8
·
verified ·
1 Parent(s): 999f174

Update model - STAGE2 Epoch 1 | Loss: 2.5060

Browse files
README.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ tags:
6
+ - vision-language
7
+ - multimodal
8
+ - robotics
9
+ - edge-deployment
10
+ - tiny-vlm
11
+ - mobilevit_xs
12
+ - smollm_135m
13
+ - stage2
14
+ base_model:
15
+ - smollm_135m
16
+ library_name: transformers
17
+ pipeline_tag: image-text-to-text
18
+ ---
19
+
20
+ # EmberVLM: Small (~137M parameters)
21
+
22
+ **🔥 Efficient Vision-Language Model for Edge Deployment & Robotic Applications**
23
+
24
+ This model is currently in training - **STAGE2 (Epoch 1)**.
25
+
26
+ ## 📊 Current Training Status
27
+
28
+ - **Stage**: Multimodal Instruction Tuning - Following complex instructions
29
+ - **Epoch**: 1
30
+ - **Last Updated**: 2026-01-25 08:39:09 UTC
31
+
32
+ ### Latest Metrics
33
+ - **instruction_loss**: 0.0000
34
+ - **loss**: 2.5060
35
+
36
+ ## 🏗️ Model Architecture
37
+
38
+ - **Size**: Small (~137M parameters)
39
+ - **Total Parameters**: 138,908,785
40
+ - **Trainable Parameters**: 34,313,153 (24.7%)
41
+ - **Vision Encoder**: Apple MobileViT-XS (~2.3M params)
42
+ - **Language Model**: SmolLM-135M (135M params)
43
+
44
+ ## 🎯 Training Curriculum
45
+
46
+ EmberVLM follows a 4-stage training curriculum:
47
+
48
+ 1. ✅ **Stage 1: Visual-Language Alignment** - Grounding vision and language
49
+ 2. 🔄 **Stage 2: Multimodal Instruction Tuning** - Following instructions *(in progress)*
50
+ 3. ⏳ **Stage 3: Robot Fleet Selection** - Task-robot matching
51
+ 4. ⏳ **Stage 4: Chain-of-Thought Reasoning** - Reasoning generation
52
+
53
+ **Current Stage**: STAGE2
54
+
55
+ ## 💻 Usage
56
+
57
+ ```python
58
+ from transformers import AutoTokenizer
59
+ from embervlm import EmberVLM
60
+ from PIL import Image
61
+
62
+ # Load model and tokenizer
63
+ model = EmberVLM.from_pretrained("euhidaman/embervlm-small")
64
+ tokenizer = AutoTokenizer.from_pretrained("euhidaman/embervlm-small")
65
+
66
+ # Load image
67
+ image = Image.open("scene.jpg")
68
+
69
+ # Generate response
70
+ prompt = "<|image|>Describe what you see and select the best robot for this task."
71
+ outputs = model.generate(
72
+ image=image,
73
+ prompt=prompt,
74
+ tokenizer=tokenizer,
75
+ max_new_tokens=256
76
+ )
77
+
78
+ print(outputs)
79
+ ```
80
+
81
+ ## 🎓 Training Details
82
+
83
+ - **Vision Backbone**: mobilevit_xs
84
+ - **Language Backbone**: smollm_135m
85
+ - **Optimization**: AdamW with cosine learning rate schedule
86
+ - **Mixed Precision**: bfloat16
87
+ - **Distributed Training**: Multi-GPU with DDP
88
+ - **Class Balancing**: Focal loss for robot selection (Stage 3)
89
+ - **Reasoning**: Chain-of-thought with reinforcement learning (Stage 4)
90
+
91
+ ## 🌍 Environmental Impact
92
+
93
+ This model is designed for edge deployment to minimize energy consumption.
94
+
95
+ ## 🎯 Intended Use
96
+
97
+ - **Primary**: Edge deployment on resource-constrained devices
98
+ - **Applications**:
99
+ - Robotic vision-language understanding
100
+ - Real-time multimodal reasoning
101
+ - Robot fleet selection and task planning
102
+ - Mobile/embedded AI systems
103
+
104
+ ## ⚠️ Limitations
105
+
106
+ - Model is still in training - performance will improve as training progresses
107
+ - Optimized for efficiency over maximum accuracy
108
+ - Best suited for edge/mobile deployment scenarios
109
+ - Training focused on robot-centric scenarios
110
+
111
+ ## 📚 Citation
112
+
113
+ ```bibtex
114
+ @software{embervlm_2026,
115
+ title = {EmberVLM: Efficient Vision-Language Model for Edge Deployment},
116
+ author = {EmberVLM Team},
117
+ year = {2026},
118
+ url = {https://huggingface.co/euhidaman/embervlm-small}
119
+ }
120
+ ```
121
+
122
+ ## 📝 License
123
+
124
+ Apache 2.0
125
+
126
+ ---
127
+
128
+ **Note**: This is a checkpoint from stage2 training (epoch 1).
129
+ The model will be updated after each epoch with improved performance.
added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|action_plan|>": 49155,
3
+ "<|image|>": 49156,
4
+ "<|reasoning_end|>": 49153,
5
+ "<|reasoning_start|>": 49152,
6
+ "<|robot_selection|>": 49154
7
+ }
config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vision_backbone": "mobilevit_xs",
3
+ "language_backbone": "smollm_135m",
4
+ "vision_model": "apple/mobilevit-x-small",
5
+ "vision_pretrained": true,
6
+ "freeze_vision": true,
7
+ "num_visual_tokens": 8,
8
+ "vision_output_dim": 384,
9
+ "image_size": 224,
10
+ "language_hidden_size": 576,
11
+ "language_num_layers": 30,
12
+ "language_num_heads": 9,
13
+ "language_vocab_size": 49157,
14
+ "language_max_length": 1024,
15
+ "freeze_language_base": true,
16
+ "unfreeze_last_layer": true,
17
+ "use_pretrained_language": true,
18
+ "pretrained_language_model": "HuggingFaceTB/SmolLM-135M",
19
+ "fusion_bottleneck_dim": 48,
20
+ "fusion_dropout": 0.1,
21
+ "use_qk_norm": true,
22
+ "reasoning_enabled": true,
23
+ "reasoning_hidden_dim": 192,
24
+ "reasoning_num_layers": 2,
25
+ "reasoning_num_heads": 4,
26
+ "num_reasoning_steps": 4,
27
+ "max_plan_steps": 5,
28
+ "num_robots": 5,
29
+ "robot_names": [
30
+ "Drone",
31
+ "Humanoid",
32
+ "Wheeled",
33
+ "Legged",
34
+ "Underwater"
35
+ ],
36
+ "special_tokens": {
37
+ "reasoning_start": "<|reasoning_start|>",
38
+ "reasoning_end": "<|reasoning_end|>",
39
+ "robot_selection": "<|robot_selection|>",
40
+ "action_plan": "<|action_plan|>",
41
+ "image_token": "<|image|>"
42
+ },
43
+ "dropout": 0.1,
44
+ "initializer_range": 0.02,
45
+ "vocab_size": 49157
46
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f885364ae88a3ee563b6e89dc792ae6d5e3c64fa7e3e0e01ab932b1d9f485255
3
+ size 286953683
special_tokens_map.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|reasoning_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|reasoning_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|robot_selection|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|action_plan|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|image|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ ],
39
+ "bos_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "eos_token": {
47
+ "content": "<|endoftext|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ },
53
+ "pad_token": "<|endoftext|>",
54
+ "unk_token": {
55
+ "content": "<|endoftext|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false
60
+ }
61
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "49152": {
141
+ "content": "<|reasoning_start|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "49153": {
149
+ "content": "<|reasoning_end|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "49154": {
157
+ "content": "<|robot_selection|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "49155": {
165
+ "content": "<|action_plan|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "49156": {
173
+ "content": "<|image|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|reasoning_start|>",
183
+ "<|reasoning_end|>",
184
+ "<|robot_selection|>",
185
+ "<|action_plan|>",
186
+ "<|image|>"
187
+ ],
188
+ "bos_token": "<|endoftext|>",
189
+ "clean_up_tokenization_spaces": false,
190
+ "eos_token": "<|endoftext|>",
191
+ "extra_special_tokens": {},
192
+ "model_max_length": 1000000000000000019884624838656,
193
+ "pad_token": "<|endoftext|>",
194
+ "tokenizer_class": "GPT2Tokenizer",
195
+ "unk_token": "<|endoftext|>",
196
+ "vocab_size": 49152
197
+ }
training_info.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage2",
3
+ "epoch": 1,
4
+ "metrics": {
5
+ "loss": 2.5060146195547923,
6
+ "instruction_loss": 0.0
7
+ },
8
+ "carbon_emissions_kg": 0.0,
9
+ "timestamp": "2026-01-25T08:39:09.549990",
10
+ "vision_backbone": "mobilevit_xs",
11
+ "language_backbone": "smollm_135m",
12
+ "total_parameters": 138908785,
13
+ "trainable_parameters": 34313153
14
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff