euhidaman commited on
Commit
bc357ac
·
verified ·
1 Parent(s): 451b058

Update model - STAGE1 Epoch 1 | Loss: 6.5212

Browse files
README.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ tags:
6
+ - vision-language
7
+ - multimodal
8
+ - robotics
9
+ - edge-deployment
10
+ - tiny-vlm
11
+ - repvit
12
+ - tinyllm
13
+ - stage1
14
+ base_model:
15
+ - tinyllm
16
+ library_name: transformers
17
+ pipeline_tag: image-text-to-text
18
+ ---
19
+
20
+ # EmberVLM: Tiny (~35M parameters)
21
+
22
+ **🔥 Efficient Vision-Language Model for Edge Deployment & Robotic Applications**
23
+
24
+ This model is currently in training - **STAGE1 (Epoch 1)**.
25
+
26
+ ## 📊 Current Training Status
27
+
28
+ - **Stage**: Visual-Language Alignment - Learning to ground vision and language
29
+ - **Epoch**: 1
30
+ - **Last Updated**: 2026-01-28 15:03:00 UTC
31
+
32
+ ### Latest Metrics
33
+ - **captioning_loss**: 8.4406
34
+ - **contrastive_loss**: 4.6019
35
+ - **loss**: 6.5212
36
+
37
+ ## 🏗️ Model Architecture
38
+
39
+ - **Size**: Tiny (~35M parameters)
40
+ - **Total Parameters**: 37,237,665
41
+ - **Trainable Parameters**: 23,254,337 (62.4%)
42
+ - **Vision Encoder**: RepViT-M0.9 (~5M params)
43
+ - **Language Model**: TinyLLM-30M (30M params)
44
+
45
+ ## 🎯 Training Curriculum
46
+
47
+ EmberVLM follows a 4-stage training curriculum:
48
+
49
+ 1. 🔄 **Stage 1: Visual-Language Alignment** - Grounding vision and language (in progress)
50
+ 2. ⏳ **Stage 2: Multimodal Instruction Tuning** - Following instructions
51
+ 3. ⏳ **Stage 3: Robot Fleet Selection** - Task-robot matching
52
+ 4. ⏳ **Stage 4: Chain-of-Thought Reasoning** - Reasoning generation
53
+
54
+ **Current Stage**: STAGE1
55
+
56
+ ## 💻 Usage
57
+
58
+ ```python
59
+ from transformers import AutoTokenizer
60
+ from embervlm import EmberVLM
61
+ from PIL import Image
62
+
63
+ # Load model and tokenizer
64
+ model = EmberVLM.from_pretrained("euhidaman/embervlm-tiny")
65
+ tokenizer = AutoTokenizer.from_pretrained("euhidaman/embervlm-tiny")
66
+
67
+ # Load image
68
+ image = Image.open("scene.jpg")
69
+
70
+ # Generate response
71
+ prompt = "<|image|>Describe what you see and select the best robot for this task."
72
+ outputs = model.generate(
73
+ image=image,
74
+ prompt=prompt,
75
+ tokenizer=tokenizer,
76
+ max_new_tokens=256
77
+ )
78
+
79
+ print(outputs)
80
+ ```
81
+
82
+ ## 🎓 Training Details
83
+
84
+ - **Vision Backbone**: repvit
85
+ - **Language Backbone**: tinyllm
86
+ - **Optimization**: AdamW with cosine learning rate schedule
87
+ - **Mixed Precision**: bfloat16
88
+ - **Distributed Training**: Multi-GPU with DDP
89
+ - **Class Balancing**: Focal loss for robot selection (Stage 3)
90
+ - **Reasoning**: Chain-of-thought with reinforcement learning (Stage 4)
91
+
92
+ ## 🌍 Environmental Impact
93
+
94
+ This model is designed for edge deployment to minimize energy consumption.
95
+
96
+ ## 🎯 Intended Use
97
+
98
+ - **Primary**: Edge deployment on resource-constrained devices
99
+ - **Applications**:
100
+ - Robotic vision-language understanding
101
+ - Real-time multimodal reasoning
102
+ - Robot fleet selection and task planning
103
+ - Mobile/embedded AI systems
104
+
105
+ ## ⚠️ Limitations
106
+
107
+ - Model is still in training - performance will improve as training progresses
108
+ - Optimized for efficiency over maximum accuracy
109
+ - Best suited for edge/mobile deployment scenarios
110
+ - Training focused on robot-centric scenarios
111
+
112
+ ## 📚 Citation
113
+
114
+ ```bibtex
115
+ @software{embervlm_2026,
116
+ title = {EmberVLM: Efficient Vision-Language Model for Edge Deployment},
117
+ author = {EmberVLM Team},
118
+ year = {2026},
119
+ url = {https://huggingface.co/euhidaman/embervlm-tiny}
120
+ }
121
+ ```
122
+
123
+ ## 📝 License
124
+
125
+ Apache 2.0
126
+
127
+ ---
128
+
129
+ **Note**: This is a checkpoint from stage1 training (epoch 1).
130
+ The model will be updated after each epoch with improved performance.
added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|action_plan|>": 50260,
3
+ "<|image|>": 50261,
4
+ "<|reasoning_end|>": 50258,
5
+ "<|reasoning_start|>": 50257,
6
+ "<|robot_selection|>": 50259
7
+ }
config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vision_backbone": "repvit",
3
+ "language_backbone": "tinyllm",
4
+ "vision_model": "repvit_m0_9",
5
+ "vision_pretrained": true,
6
+ "freeze_vision": true,
7
+ "num_visual_tokens": 8,
8
+ "vision_output_dim": 384,
9
+ "image_size": 224,
10
+ "language_hidden_size": 384,
11
+ "language_num_layers": 6,
12
+ "language_num_heads": 6,
13
+ "language_vocab_size": 50262,
14
+ "language_max_length": 1024,
15
+ "freeze_language_base": true,
16
+ "unfreeze_last_layer": true,
17
+ "use_pretrained_language": true,
18
+ "pretrained_language_model": "tinyllm/30M-0.4",
19
+ "fusion_bottleneck_dim": 48,
20
+ "fusion_dropout": 0.1,
21
+ "use_qk_norm": true,
22
+ "reasoning_enabled": true,
23
+ "reasoning_hidden_dim": 192,
24
+ "reasoning_num_layers": 2,
25
+ "reasoning_num_heads": 4,
26
+ "num_reasoning_steps": 4,
27
+ "max_plan_steps": 5,
28
+ "num_robots": 5,
29
+ "robot_names": [
30
+ "Drone",
31
+ "Humanoid",
32
+ "Wheeled",
33
+ "Legged",
34
+ "Underwater"
35
+ ],
36
+ "special_tokens": {
37
+ "reasoning_start": "<|reasoning_start|>",
38
+ "reasoning_end": "<|reasoning_end|>",
39
+ "robot_selection": "<|robot_selection|>",
40
+ "action_plan": "<|action_plan|>",
41
+ "image_token": "<|image|>"
42
+ },
43
+ "dropout": 0.1,
44
+ "initializer_range": 0.02,
45
+ "vocab_size": 50262
46
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a0610a57d6c72d939c944e5b71106e003f0c4a3d6fc9daa5b9ac934e22922e
3
+ size 88817547
special_tokens_map.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|reasoning_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|reasoning_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|robot_selection|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|action_plan|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|image|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ ],
39
+ "bos_token": {
40
+ "content": "<|endoftext|>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "eos_token": {
47
+ "content": "<|endoftext|>",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ },
53
+ "pad_token": "<|endoftext|>",
54
+ "unk_token": {
55
+ "content": "<|endoftext|>",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false
60
+ }
61
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": "<|reasoning_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50258": {
22
+ "content": "<|reasoning_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "50259": {
30
+ "content": "<|robot_selection|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "50260": {
38
+ "content": "<|action_plan|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "50261": {
46
+ "content": "<|image|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<|reasoning_start|>",
56
+ "<|reasoning_end|>",
57
+ "<|robot_selection|>",
58
+ "<|action_plan|>",
59
+ "<|image|>"
60
+ ],
61
+ "bos_token": "<|endoftext|>",
62
+ "clean_up_tokenization_spaces": true,
63
+ "eos_token": "<|endoftext|>",
64
+ "errors": "replace",
65
+ "extra_special_tokens": {},
66
+ "model_max_length": 1024,
67
+ "pad_token": "<|endoftext|>",
68
+ "tokenizer_class": "GPT2Tokenizer",
69
+ "unk_token": "<|endoftext|>"
70
+ }
training_info.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "stage": "stage1",
3
+ "epoch": 1,
4
+ "metrics": {
5
+ "loss": 6.521240068518597,
6
+ "contrastive_loss": 4.601919858351998,
7
+ "captioning_loss": 8.440560257953146
8
+ },
9
+ "carbon_emissions_kg": 0.0,
10
+ "timestamp": "2026-01-28T15:03:00.655056",
11
+ "vision_backbone": "repvit",
12
+ "language_backbone": "tinyllm",
13
+ "total_parameters": 37237665,
14
+ "trainable_parameters": 23254337
15
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff