niobures committed on
Commit
492cf17
·
verified ·
1 Parent(s): 0785453

Kosmos (code, demo, models, paper)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. KOSMOS-2.5. A Multimodal Literate Model.pdf +3 -0
  3. Kosmos-2. Grounding Multimodal Large Language Models to the World.pdf +3 -0
  4. Language Is Not All You Need. Aligning Perception with Language Models.pdf +3 -0
  5. code/Kosmos2.5 (kyegomez).zip +3 -0
  6. code/kosmos2.5-8bitq-onnx.zip +3 -0
  7. code/kosmos2.5-int8-quantization.zip +3 -0
  8. demo/kosmos-2.5-demo/.gitattributes +35 -0
  9. demo/kosmos-2.5-demo/README.md +61 -0
  10. demo/kosmos-2.5-demo/app.py +315 -0
  11. demo/kosmos-2.5-demo/requirements.txt +7 -0
  12. demo/kosmos-2.5-demo/source.txt +1 -0
  13. models/kosmos-2.5-4bit-text/.gitattributes +35 -0
  14. models/kosmos-2.5-4bit-text/README.md +199 -0
  15. models/kosmos-2.5-4bit-text/config.json +26 -0
  16. models/kosmos-2.5-4bit-text/generation_config.json +7 -0
  17. models/kosmos-2.5-4bit-text/model.safetensors +3 -0
  18. models/kosmos-2.5-4bit-text/source.txt +1 -0
  19. models/kosmos-2.5-4bit-vision/.gitattributes +35 -0
  20. models/kosmos-2.5-4bit-vision/README.md +199 -0
  21. models/kosmos-2.5-4bit-vision/config.json +22 -0
  22. models/kosmos-2.5-4bit-vision/model.safetensors +3 -0
  23. models/kosmos-2.5-4bit-vision/source.txt +1 -0
  24. models/kosmos-2.5-chat/.gitattributes +35 -0
  25. models/kosmos-2.5-chat/README.md +87 -0
  26. models/kosmos-2.5-chat/chat.py +40 -0
  27. models/kosmos-2.5-chat/config.json +163 -0
  28. models/kosmos-2.5-chat/generation_config.json +8 -0
  29. models/kosmos-2.5-chat/model-00001-of-00002.safetensors +3 -0
  30. models/kosmos-2.5-chat/model-00002-of-00002.safetensors +3 -0
  31. models/kosmos-2.5-chat/model.safetensors.index.json +621 -0
  32. models/kosmos-2.5-chat/preprocessor_config.json +5 -0
  33. models/kosmos-2.5-chat/source.txt +1 -0
  34. models/kosmos-2.5-chat/special_tokens_map.json +33 -0
  35. models/kosmos-2.5-chat/tokenizer.json +0 -0
  36. models/kosmos-2.5-chat/tokenizer_config.json +0 -0
  37. models/kosmos-2.5-ft/.gitattributes +35 -0
  38. models/kosmos-2.5-ft/README.md +111 -0
  39. models/kosmos-2.5-ft/config.json +46 -0
  40. models/kosmos-2.5-ft/generation_config.json +7 -0
  41. models/kosmos-2.5-ft/model.safetensors +3 -0
  42. models/kosmos-2.5-ft/optimizer.pt +3 -0
  43. models/kosmos-2.5-ft/rng_state.pth +3 -0
  44. models/kosmos-2.5-ft/scheduler.pt +3 -0
  45. models/kosmos-2.5-ft/source.txt +1 -0
  46. models/kosmos-2.5-ft/trainer_state.json +111 -0
  47. models/kosmos-2.5-ft/training_args.bin +3 -0
  48. models/kosmos-2.5/.gitattributes +37 -0
  49. models/kosmos-2.5/README.md +156 -0
  50. models/kosmos-2.5/ckpt.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Kosmos-2.[[:space:]]Grounding[[:space:]]Multimodal[[:space:]]Large[[:space:]]Language[[:space:]]Models[[:space:]]to[[:space:]]the[[:space:]]World.pdf filter=lfs diff=lfs merge=lfs -text
37
+ KOSMOS-2.5.[[:space:]]A[[:space:]]Multimodal[[:space:]]Literate[[:space:]]Model.pdf filter=lfs diff=lfs merge=lfs -text
38
+ Language[[:space:]]Is[[:space:]]Not[[:space:]]All[[:space:]]You[[:space:]]Need.[[:space:]]Aligning[[:space:]]Perception[[:space:]]with[[:space:]]Language[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
39
+ models/kosmos-2.5/output.png filter=lfs diff=lfs merge=lfs -text
40
+ models/kosmos-2.5/receipt_00008.png filter=lfs diff=lfs merge=lfs -text
KOSMOS-2.5. A Multimodal Literate Model.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f659ae6c3696172faf4afa57c6c6d563d9fc026ca378ceccdbfb33a5e5ee20f1
3
+ size 6426197
Kosmos-2. Grounding Multimodal Large Language Models to the World.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:064214db82bb573831fd102ace00be90147633e7f47dac491089619e03bc7e58
3
+ size 7580509
Language Is Not All You Need. Aligning Perception with Language Models.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6a848b830a5ceabaf4fd1cbd38d32572edca05d7db513398bca459f3cf6352a
3
+ size 3743156
code/Kosmos2.5 (kyegomez).zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e20f0129cf0521f1b1e2b0b8d283b734644baade3f9a58545b6201d60f31eddc
3
+ size 522240
code/kosmos2.5-8bitq-onnx.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05f788b060b553a6c05a75982c60059ea1245911d17f693caabd64d1f959618c
3
+ size 68711
code/kosmos2.5-int8-quantization.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28621b029cd25851142c1d0670cf6d0f867e3845cd71c2766acfc7996dd41a06
3
+ size 62854
demo/kosmos-2.5-demo/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
demo/kosmos-2.5-demo/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: KOSMOS-2.5 Document AI Demo
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # KOSMOS-2.5 Document AI Demo
14
+
15
+ This Space demonstrates the capabilities of Microsoft's **KOSMOS-2.5**, a multimodal literate model for machine reading of text-intensive images.
16
+
17
+ ## Features
18
+
19
+ 🔥 **Three powerful modes**:
20
+
21
+ 1. **📝 Markdown Generation**: Convert document images to clean markdown format
22
+ 2. **🔍 OCR with Bounding Boxes**: Extract text with precise spatial coordinates and visualization
23
+ 3. **💬 Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat
24
+
25
+ ## What is KOSMOS-2.5?
26
+
27
+ KOSMOS-2.5 is Microsoft's latest document AI model that excels at understanding text-rich images. It can:
28
+
29
+ - Generate spatially-aware text blocks with coordinates
30
+ - Produce structured markdown output that captures document styles
31
+ - Answer questions about document content through the chat variant
32
+
33
+ The model was pre-trained on 357.4 million text-rich document images and achieves performance comparable to much larger models (1.3B vs 7B parameters) on visual question answering benchmarks.
34
+
35
+ ## Example Use Cases
36
+
37
+ - **Receipts**: Extract itemized information or ask "What's the total amount?"
38
+ - **Forms**: Convert to structured format or query specific fields
39
+ - **Articles**: Get clean markdown or ask content-specific questions
40
+ - **Screenshots**: Extract UI text or get information about elements
41
+
42
+ ## Model Information
43
+
44
+ - **Base Model**: [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5)
45
+ - **Chat Model**: [microsoft/kosmos-2.5-chat](https://huggingface.co/microsoft/kosmos-2.5-chat)
46
+ - **Paper**: [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
47
+
48
+ ## Note
49
+
50
+ This is a generative model and may occasionally produce inaccurate results. Please verify outputs for critical applications.
51
+
52
+ ## Citation
53
+
54
+ ```bibtex
55
+ @article{lv2023kosmos,
56
+ title={Kosmos-2.5: A multimodal literate model},
57
+ author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
58
+ journal={arXiv preprint arXiv:2309.11419},
59
+ year={2023}
60
+ }
61
+ ```
demo/kosmos-2.5-demo/app.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import torch
3
+ import gradio as gr
4
+ from PIL import Image
5
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
6
+ import re
7
+
8
+ # Check if CUDA is available
9
+ device = "cuda" if torch.cuda.is_available() else "cpu"
10
+ dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
11
+
12
+ # Check if Flash Attention 2 is available
13
+ def is_flash_attention_available():
14
+ try:
15
+ import flash_attn
16
+ return True
17
+ except ImportError:
18
+ return False
19
+
20
+ # Initialize models and processors lazily
21
+ base_model = None
22
+ base_processor = None
23
+ chat_model = None
24
+ chat_processor = None
25
+
26
+ def load_base_model():
27
+ global base_model, base_processor
28
+ if base_model is None:
29
+ base_repo = "microsoft/kosmos-2.5"
30
+
31
+ # Use Flash Attention 2 if available, otherwise use default attention
32
+ model_kwargs = {
33
+ "device_map": "cuda",
34
+ "dtype": dtype,
35
+ }
36
+ if is_flash_attention_available():
37
+ model_kwargs["attn_implementation"] = "flash_attention_2"
38
+
39
+ base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
40
+ base_repo,
41
+ **model_kwargs
42
+ )
43
+ base_processor = AutoProcessor.from_pretrained(base_repo)
44
+ return base_model, base_processor
45
+
46
+ def load_chat_model():
47
+ global chat_model, chat_processor
48
+ if chat_model is None:
49
+ chat_repo = "microsoft/kosmos-2.5-chat"
50
+
51
+ # Use Flash Attention 2 if available, otherwise use default attention
52
+ model_kwargs = {
53
+ "device_map": "cuda",
54
+ "dtype": dtype,
55
+ }
56
+ if is_flash_attention_available():
57
+ model_kwargs["attn_implementation"] = "flash_attention_2"
58
+
59
+ chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
60
+ chat_repo,
61
+ **model_kwargs
62
+ )
63
+ chat_processor = AutoProcessor.from_pretrained(chat_repo)
64
+ return chat_model, chat_processor
65
+
66
def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
    """Strip the task prompt and convert raw bbox markup into quad/text rows.

    For a "<md>" prompt the text is returned unchanged (minus the prompt).
    For OCR output, each ``<bbox><x_..><y_..><x_..><y_..></bbox>`` tag is
    parsed, its coordinates are rescaled back to original-image pixels, and
    each detection is emitted as one "x0,y0,x1,y0,x1,y1,x0,y1,text" line
    (four corner points, clockwise from top-left). Degenerate boxes
    (zero/negative width or height) are dropped together with their text.
    """
    y = y.replace(prompt, "")
    if "<md>" in prompt:
        return y

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    raw_boxes = re.findall(pattern, y)
    texts = re.split(pattern, y)[1:]
    boxes = [[int(n) for n in re.findall(r"\d+", raw)] for raw in raw_boxes]

    rows = []
    for text, box in zip(texts, boxes):
        x0, y0, x1, y1 = box
        if x0 >= x1 or y0 >= y1:
            continue  # degenerate box: skip this detection entirely
        x0, x1 = int(x0 * scale_width), int(x1 * scale_width)
        y0, y1 = int(y0 * scale_height), int(y1 * scale_height)
        rows.append(f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{text}")
    return "\n".join(rows).strip()
89
+
90
@spaces.GPU(duration=120)
def generate_markdown(image):
    """Convert a document image to markdown with the base KOSMOS-2.5 model.

    Args:
        image: PIL.Image uploaded by the user, or None.

    Returns:
        str: the generated markdown, or a hint when no image was provided.
    """
    if image is None:
        return "Please upload an image."

    model, processor = load_base_model()

    prompt = "<md>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # The processor returns its working resolution; pop the entries so they
    # are not forwarded to generate(). Markdown output has no coordinates,
    # so the scale factors used by the OCR path are not needed here.
    inputs.pop("height")
    inputs.pop("width")

    # Bug fix: move tensors to the detected device instead of hard-coded
    # "cuda" so CPU-only hosts do not crash.
    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_text[0].replace(prompt, "").strip()
118
+
119
@spaces.GPU(duration=120)
def generate_ocr(image):
    """Run OCR on a document image and visualize the detected text boxes.

    Args:
        image: PIL.Image uploaded by the user, or None.

    Returns:
        tuple: ``(text, vis_image)`` where ``text`` holds one
        "x0,y0,x1,y0,x1,y1,x0,y1,text" line per detection in original-image
        pixel coordinates, and ``vis_image`` is a copy of the input with the
        detected quads outlined in red (None when no image was given).
    """
    if image is None:
        return "Please upload an image.", None

    model, processor = load_base_model()

    prompt = "<ocr>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # The processor resizes the image; keep the ratios needed to map the
    # model's coordinates back onto the original image.
    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    # Bug fix: move tensors to the detected device instead of hard-coded
    # "cuda" so CPU-only hosts do not crash.
    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Post-process OCR output into "8 coords + text" CSV-style lines.
    output_text = post_process_ocr(generated_text[0], scale_height, scale_width)

    # Create visualization
    from PIL import ImageDraw
    vis_image = image.copy()
    draw = ImageDraw.Draw(vis_image)

    for line in output_text.split("\n"):
        if not line.strip():
            continue
        parts = line.split(",")
        if len(parts) >= 8:
            try:
                coords = list(map(int, parts[:8]))
                draw.polygon(coords, outline="red", width=2)
            except (ValueError, TypeError):
                # Bug fix: catch only parse/draw argument errors instead of
                # a bare except that would also swallow KeyboardInterrupt.
                continue

    return output_text, vis_image
166
+
167
@spaces.GPU(duration=120)
def generate_chat_response(image, question):
    """Answer a free-form question about a document image with KOSMOS-2.5 Chat.

    Args:
        image: PIL.Image uploaded by the user, or None.
        question: the user's question text.

    Returns:
        str: the assistant's answer, or a hint when input is missing.
    """
    if image is None:
        return "Please upload an image."
    if not question.strip():
        return "Please ask a question."

    model, processor = load_chat_model()

    # Prompt format expected by microsoft/kosmos-2.5-chat.
    template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
    prompt = template.format(question)

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Pop the processor's working resolution so it is not forwarded to
    # generate(); the chat task emits no coordinates, so the scale factors
    # used by the OCR path are not needed here.
    inputs.pop("height")
    inputs.pop("width")

    # Bug fix: move tensors to the detected device instead of hard-coded
    # "cuda" so CPU-only hosts do not crash.
    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Extract only the assistant's response
    result = generated_text[0]
    if "ASSISTANT:" in result:
        result = result.split("ASSISTANT:")[-1].strip()

    return result
203
+
204
# Create Gradio interface
# Layout: three tabs (markdown, OCR, chat), each pairing an input column
# (image + examples + button) with an output column; click handlers are
# wired up after the layout is declared.
with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # KOSMOS-2.5 Document AI Demo

    Explore Microsoft's KOSMOS-2.5, a multimodal model for reading text-intensive images!
    This demo showcases three capabilities:

    1. **Markdown Generation**: Convert document images to markdown format
    2. **OCR with Bounding Boxes**: Extract text with spatial coordinates
    3. **Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat

    Upload a document image (receipt, form, article, etc.) and try different tasks!
    """)

    with gr.Tabs():
        # Markdown Generation Tab
        with gr.TabItem("📝 Markdown Generation"):
            with gr.Row():
                with gr.Column():
                    md_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=md_image
                    )
                    md_button = gr.Button("Generate Markdown", variant="primary")
                with gr.Column():
                    md_output = gr.Textbox(
                        label="Generated Markdown",
                        lines=15,
                        max_lines=20,
                        show_copy_button=True
                    )

        # OCR Tab
        with gr.TabItem("🔍 OCR with Bounding Boxes"):
            with gr.Row():
                with gr.Column():
                    ocr_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=ocr_image
                    )
                    ocr_button = gr.Button("Extract Text with Coordinates", variant="primary")
                with gr.Column():
                    with gr.Row():
                        ocr_text = gr.Textbox(
                            label="Extracted Text with Coordinates",
                            lines=10,
                            show_copy_button=True
                        )
                    ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")

        # Chat Tab
        with gr.TabItem("💬 Document Q&A (Chat)"):
            with gr.Row():
                with gr.Column():
                    chat_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=chat_image
                    )
                    chat_question = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="e.g., What is the total amount on this receipt?",
                        lines=2
                    )
                    gr.Examples(
                        examples=["What is the total amount on this receipt?", "What items were purchased?", "When was this receipt issued?", "What is the subtotal?"],
                        inputs=chat_question
                    )
                    chat_button = gr.Button("Get Answer", variant="primary")
                with gr.Column():
                    chat_output = gr.Textbox(
                        label="Answer",
                        lines=8,
                        show_copy_button=True
                    )

    # Event handlers
    # Each button routes its tab's inputs to the matching @spaces.GPU
    # inference function defined above.
    md_button.click(
        fn=generate_markdown,
        inputs=[md_image],
        outputs=[md_output]
    )

    ocr_button.click(
        fn=generate_ocr,
        inputs=[ocr_image],
        outputs=[ocr_text, ocr_vis]
    )

    chat_button.click(
        fn=generate_chat_response,
        inputs=[chat_image, chat_question],
        outputs=[chat_output]
    )

    # Examples section
    gr.Markdown("""
    ## Example Use Cases:
    - **Receipts**: Extract itemized information or ask about totals
    - **Forms**: Convert to structured format or answer specific questions
    - **Articles**: Get markdown format or ask about content
    - **Screenshots**: Extract text or get information about specific elements

    ## Note:
    This is a generative model and may occasionally hallucinate. Results should be verified for accuracy.
    """)

if __name__ == "__main__":
    demo.launch()
demo/kosmos-2.5-demo/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ torch>=2.0.0
3
+ git+https://github.com/huggingface/transformers.git
4
+ accelerate
5
+ pillow
6
+ requests
7
+ spaces
demo/kosmos-2.5-demo/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/spaces/nielsr/kosmos-2.5-demo
models/kosmos-2.5-4bit-text/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-4bit-text/README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
models/kosmos-2.5-4bit-text/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "architectures": [
5
+ "Kosmos2_5TextForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "attention_heads": 16,
9
+ "bos_token_id": 0,
10
+ "dropout": 0,
11
+ "embed_dim": 1536,
12
+ "eos_token_id": 2,
13
+ "ffn_dim": 6144,
14
+ "init_std": 0.02,
15
+ "layer_norm_eps": 1e-05,
16
+ "layerdrop": 0.0,
17
+ "layers": 24,
18
+ "max_position_embeddings": 4096,
19
+ "model_type": "kosmos_2_5_text_model",
20
+ "pad_token_id": 1,
21
+ "scale_embedding": true,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.47.0.dev0",
24
+ "use_cache": true,
25
+ "vocab_size": 108481
26
+ }
models/kosmos-2.5-4bit-text/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.47.0.dev0"
7
+ }
models/kosmos-2.5-4bit-text/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487cc16ca412b5b3b4ed471a4f4b1f4a203cceaacb40a071bc8bbae794c38c84
3
+ size 717140144
models/kosmos-2.5-4bit-text/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Fireblossom/kosmos-2.5-4bit-text
models/kosmos-2.5-4bit-vision/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-4bit-vision/README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
models/kosmos-2.5-4bit-vision/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Kosmos2_5VisionModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "d_ff": 3968,
7
+ "d_kv": 64,
8
+ "dense_act_fn": "gelu_new",
9
+ "dropout_rate": 0.0,
10
+ "hidden_size": 1536,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 1e-10,
13
+ "layer_norm_eps": 1e-06,
14
+ "max_length": 4096,
15
+ "model_type": "kosmos_2_5_vision_model",
16
+ "num_attention_heads": 24,
17
+ "num_hidden_layers": 18,
18
+ "patch_embed_hidden_size": 768,
19
+ "seq_len": 4096,
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.47.0.dev0"
22
+ }
models/kosmos-2.5-4bit-vision/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab95bc9ba21aa3f673e447a4bb6ffbb694c38baf442d05a93bef748f11d8d1c7
3
+ size 306710574
models/kosmos-2.5-4bit-vision/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Fireblossom/kosmos-2.5-4bit-vision
models/kosmos-2.5-chat/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-chat/README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ library_name: transformers
5
+ pipeline_tag: image-text-to-text
6
+ ---
7
+ # Kosmos-2.5-chat
8
+
9
+ [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
10
+
11
+ ## Model description
12
+ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
13
+
14
+ Kosmos-2.5-chat is a model specifically trained for Visual Question Answering (VQA) tasks, based on further training of Kosmos-2.5. For more details about Kosmos-2.5-chat, please refer to the paper.
15
+
16
+ [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
17
+
18
+ ## Usage
19
+
20
+ KOSMOS-2.5 is supported from Transformers >= 4.56. Find the docs [here](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5).
21
+
22
+ ```python
23
+ import re
24
+ import torch
25
+ import requests
26
+ from PIL import Image, ImageDraw
27
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
28
+
29
+ repo = "microsoft/kosmos-2.5-chat"
30
+ device = "cuda:0"
31
+ dtype = torch.bfloat16
32
+
33
+ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo,
34
+ device_map=device,
35
+ torch_dtype=dtype,
36
+ attn_implementation="flash_attention_2")
37
+ processor = AutoProcessor.from_pretrained(repo)
38
+
39
+ # sample image
40
+ url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
41
+
42
+ image = Image.open(requests.get(url, stream=True).raw)
43
+
44
+ question = "What is the sub total of the receipt?"
45
+ template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
46
+ prompt = template.format(question)
47
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
48
+
49
+ height, width = inputs.pop("height"), inputs.pop("width")
50
+ raw_width, raw_height = image.size
51
+ scale_height = raw_height / height
52
+ scale_width = raw_width / width
53
+
54
+ inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
55
+ inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
56
+ generated_ids = model.generate(
57
+ **inputs,
58
+ max_new_tokens=1024,
59
+ )
60
+
61
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
62
+ print(generated_text[0])
63
+ ```
64
+
65
+ ## NOTE:
66
+ Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **cannot** guarantee the accuracy of all results in the images.
67
+
68
+ ## Inference
69
+ **Document Understanding Task:** For usage instructions, please refer to [chat.py](chat.py).
70
+
71
+ ## Citation
72
+
73
+ If you find Kosmos-2.5-chat useful in your research, please cite the following paper:
74
+
75
+ ```
76
+ @article{lv2023kosmos,
77
+ title={Kosmos-2.5: A multimodal literate model},
78
+ author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
79
+ journal={arXiv preprint arXiv:2309.11419},
80
+ year={2023}
81
+ }
82
+ ```
83
+
84
+ ## License
85
+ The content of this project itself is licensed under the [MIT License](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE).
86
+
87
+ [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
models/kosmos-2.5-chat/chat.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# chat.py -- minimal Visual Question Answering demo for microsoft/kosmos-2.5-chat.
#
# Downloads a sample receipt image, asks the model a question about it, and
# prints the decoded answer. Requires a CUDA GPU; flash-attention 2 must be
# installed for the requested attention implementation.
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5-chat"
device = "cuda:0"
dtype = torch.bfloat16  # bf16: same exponent range as fp32, half the memory

model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    repo,
    device_map=device,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(repo)

# Sample image hosted alongside the base kosmos-2.5 checkpoint.
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the sub total of the receipt?"
# Chat-style prompt template expected by the fine-tuned model; the <md> tag
# selects the markdown task head.
template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
prompt = template.format(question)
inputs = processor(text=prompt, images=image, return_tensors="pt")

# The processor also returns the resized image height/width; pop them because
# `generate` does not accept them as keyword arguments. (They are only needed
# for bounding-box rescaling in the OCR task, not for VQA chat.)
height, width = inputs.pop("height"), inputs.pop("width")

# Move tensors to the target device; flattened patches must match model dtype.
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
models/kosmos-2.5-chat/config.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Kosmos2_5ForConditionalGeneration"
4
+ ],
5
+ "latent_query_num": 2048,
6
+ "model_type": "kosmos-2.5",
7
+ "text_config": {
8
+ "_name_or_path": "",
9
+ "activation_dropout": 0.0,
10
+ "activation_function": "gelu",
11
+ "add_cross_attention": false,
12
+ "architectures": null,
13
+ "attention_dropout": 0.0,
14
+ "attention_heads": 16,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "cross_attention_hidden_size": null,
20
+ "decoder_start_token_id": null,
21
+ "diversity_penalty": 0.0,
22
+ "do_sample": false,
23
+ "dropout": 0.1,
24
+ "early_stopping": false,
25
+ "embed_dim": 1536,
26
+ "encoder_no_repeat_ngram_size": 0,
27
+ "eos_token_id": 2,
28
+ "exponential_decay_length_penalty": null,
29
+ "ffn_dim": 6144,
30
+ "finetuning_task": null,
31
+ "forced_bos_token_id": null,
32
+ "forced_eos_token_id": null,
33
+ "id2label": {
34
+ "0": "LABEL_0",
35
+ "1": "LABEL_1"
36
+ },
37
+ "init_std": 0.02,
38
+ "is_decoder": false,
39
+ "is_encoder_decoder": false,
40
+ "label2id": {
41
+ "LABEL_0": 0,
42
+ "LABEL_1": 1
43
+ },
44
+ "layer_norm_eps": 1e-05,
45
+ "layerdrop": 0.0,
46
+ "length_penalty": 1.0,
47
+ "layers": 24,
48
+ "max_length": 20,
49
+ "max_position_embeddings": 4096,
50
+ "min_length": 0,
51
+ "model_type": "kosmos_2_5_text_model",
52
+ "no_repeat_ngram_size": 3,
53
+ "num_beam_groups": 1,
54
+ "num_beams": 1,
55
+ "num_return_sequences": 1,
56
+ "output_attentions": false,
57
+ "output_hidden_states": false,
58
+ "output_scores": false,
59
+ "pad_token_id": 1,
60
+ "prefix": null,
61
+ "problem_type": null,
62
+ "pruned_heads": {},
63
+ "remove_invalid_values": false,
64
+ "repetition_penalty": 1.0,
65
+ "return_dict": true,
66
+ "return_dict_in_generate": false,
67
+ "scale_embedding": true,
68
+ "sep_token_id": null,
69
+ "suppress_tokens": null,
70
+ "task_specific_params": null,
71
+ "temperature": 1.0,
72
+ "tf_legacy_loss": false,
73
+ "tie_encoder_decoder": false,
74
+ "tie_word_embeddings": true,
75
+ "tokenizer_class": null,
76
+ "top_k": 50,
77
+ "top_p": 1.0,
78
+ "torch_dtype": null,
79
+ "torchscript": false,
80
+ "typical_p": 1.0,
81
+ "use_bfloat16": false,
82
+ "use_cache": true,
83
+ "vocab_size": 108481
84
+ },
85
+ "torch_dtype": "float32",
86
+ "transformers_version": "4.43.3",
87
+ "vision_config": {
88
+ "_name_or_path": "",
89
+ "add_cross_attention": false,
90
+ "architectures": null,
91
+ "attention_dropout": 0.0,
92
+ "bad_words_ids": null,
93
+ "begin_suppress_tokens": null,
94
+ "bos_token_id": null,
95
+ "chunk_size_feed_forward": 0,
96
+ "cross_attention_hidden_size": null,
97
+ "intermediate_size": 3968,
98
+ "head_dim": 64,
99
+ "decoder_start_token_id": null,
100
+ "dense_act_fn": "gelu_new",
101
+ "diversity_penalty": 0.0,
102
+ "do_sample": false,
103
+ "dropout_rate": 0.0,
104
+ "early_stopping": false,
105
+ "encoder_no_repeat_ngram_size": 0,
106
+ "eos_token_id": null,
107
+ "exponential_decay_length_penalty": null,
108
+ "finetuning_task": null,
109
+ "forced_bos_token_id": null,
110
+ "forced_eos_token_id": null,
111
+ "hidden_size": 1536,
112
+ "id2label": {
113
+ "0": "LABEL_0",
114
+ "1": "LABEL_1"
115
+ },
116
+ "initializer_factor": 1.0,
117
+ "initializer_range": 1e-10,
118
+ "is_decoder": false,
119
+ "is_encoder_decoder": false,
120
+ "label2id": {
121
+ "LABEL_0": 0,
122
+ "LABEL_1": 1
123
+ },
124
+ "layer_norm_eps": 1e-06,
125
+ "length_penalty": 1.0,
126
+ "max_length": 4096,
127
+ "min_length": 0,
128
+ "model_type": "kosmos_2_5_vision_model",
129
+ "no_repeat_ngram_size": 0,
130
+ "num_attention_heads": 24,
131
+ "num_beam_groups": 1,
132
+ "num_beams": 1,
133
+ "num_hidden_layers": 18,
134
+ "num_return_sequences": 1,
135
+ "output_attentions": false,
136
+ "output_hidden_states": false,
137
+ "output_scores": false,
138
+ "pad_token_id": null,
139
+ "patch_embed_hidden_size": 768,
140
+ "prefix": null,
141
+ "problem_type": null,
142
+ "pruned_heads": {},
143
+ "remove_invalid_values": false,
144
+ "repetition_penalty": 1.0,
145
+ "return_dict": true,
146
+ "return_dict_in_generate": false,
147
+ "sep_token_id": null,
148
+ "max_num_patches": 4096,
149
+ "suppress_tokens": null,
150
+ "task_specific_params": null,
151
+ "temperature": 1.0,
152
+ "tf_legacy_loss": false,
153
+ "tie_encoder_decoder": false,
154
+ "tie_word_embeddings": true,
155
+ "tokenizer_class": null,
156
+ "top_k": 50,
157
+ "top_p": 1.0,
158
+ "torch_dtype": null,
159
+ "torchscript": false,
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false
162
+ }
163
+ }
models/kosmos-2.5-chat/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "no_repeat_ngram_size": 3,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.43.3"
8
+ }
models/kosmos-2.5-chat/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea32f178375c21412ee2829b2389682544ecd6f990a6120b219e065b1500d085
3
+ size 4995252144
models/kosmos-2.5-chat/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b642fe2ce0ad3ff30838a3daebb6d26252770c65d29306a809e05598fc0e393
3
+ size 503408384
models/kosmos-2.5-chat/model.safetensors.index.json ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 5498585088
4
+ },
5
+ "weight_map": {
6
+ "image_to_text_projection.dense.bias": "model-00002-of-00002.safetensors",
7
+ "image_to_text_projection.dense.weight": "model-00002-of-00002.safetensors",
8
+ "image_to_text_projection.latent_query": "model-00002-of-00002.safetensors",
9
+ "image_to_text_projection.x_attn.k_proj.bias": "model-00002-of-00002.safetensors",
10
+ "image_to_text_projection.x_attn.k_proj.weight": "model-00002-of-00002.safetensors",
11
+ "image_to_text_projection.x_attn.out_proj.bias": "model-00002-of-00002.safetensors",
12
+ "image_to_text_projection.x_attn.out_proj.weight": "model-00002-of-00002.safetensors",
13
+ "image_to_text_projection.x_attn.q_proj.bias": "model-00002-of-00002.safetensors",
14
+ "image_to_text_projection.x_attn.q_proj.weight": "model-00002-of-00002.safetensors",
15
+ "image_to_text_projection.x_attn.v_proj.bias": "model-00002-of-00002.safetensors",
16
+ "image_to_text_projection.x_attn.v_proj.weight": "model-00002-of-00002.safetensors",
17
+ "text_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
18
+ "text_model.model.layer_norm.bias": "model-00001-of-00002.safetensors",
19
+ "text_model.model.layer_norm.weight": "model-00001-of-00002.safetensors",
20
+ "text_model.model.layers.0.ffn.fc1.bias": "model-00001-of-00002.safetensors",
21
+ "text_model.model.layers.0.ffn.fc1.weight": "model-00001-of-00002.safetensors",
22
+ "text_model.model.layers.0.ffn.fc2.bias": "model-00001-of-00002.safetensors",
23
+ "text_model.model.layers.0.ffn.fc2.weight": "model-00001-of-00002.safetensors",
24
+ "text_model.model.layers.0.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
25
+ "text_model.model.layers.0.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
26
+ "text_model.model.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
27
+ "text_model.model.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
28
+ "text_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
29
+ "text_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
30
+ "text_model.model.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
31
+ "text_model.model.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
32
+ "text_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
33
+ "text_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
+ "text_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
35
+ "text_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
+ "text_model.model.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
37
+ "text_model.model.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
38
+ "text_model.model.layers.1.ffn.fc1.bias": "model-00001-of-00002.safetensors",
39
+ "text_model.model.layers.1.ffn.fc1.weight": "model-00001-of-00002.safetensors",
40
+ "text_model.model.layers.1.ffn.fc2.bias": "model-00001-of-00002.safetensors",
41
+ "text_model.model.layers.1.ffn.fc2.weight": "model-00001-of-00002.safetensors",
42
+ "text_model.model.layers.1.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
43
+ "text_model.model.layers.1.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
44
+ "text_model.model.layers.1.final_layer_norm.bias": "model-00001-of-00002.safetensors",
45
+ "text_model.model.layers.1.final_layer_norm.weight": "model-00001-of-00002.safetensors",
46
+ "text_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
47
+ "text_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "text_model.model.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
49
+ "text_model.model.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
50
+ "text_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
51
+ "text_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "text_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
53
+ "text_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
54
+ "text_model.model.layers.1.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
55
+ "text_model.model.layers.1.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
56
+ "text_model.model.layers.10.ffn.fc1.bias": "model-00001-of-00002.safetensors",
57
+ "text_model.model.layers.10.ffn.fc1.weight": "model-00001-of-00002.safetensors",
58
+ "text_model.model.layers.10.ffn.fc2.bias": "model-00001-of-00002.safetensors",
59
+ "text_model.model.layers.10.ffn.fc2.weight": "model-00001-of-00002.safetensors",
60
+ "text_model.model.layers.10.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
61
+ "text_model.model.layers.10.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
62
+ "text_model.model.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
63
+ "text_model.model.layers.10.final_layer_norm.weight": "model-00001-of-00002.safetensors",
64
+ "text_model.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
65
+ "text_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
66
+ "text_model.model.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
67
+ "text_model.model.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
68
+ "text_model.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
69
+ "text_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
+ "text_model.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
71
+ "text_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
72
+ "text_model.model.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
73
+ "text_model.model.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
74
+ "text_model.model.layers.11.ffn.fc1.bias": "model-00001-of-00002.safetensors",
75
+ "text_model.model.layers.11.ffn.fc1.weight": "model-00001-of-00002.safetensors",
76
+ "text_model.model.layers.11.ffn.fc2.bias": "model-00001-of-00002.safetensors",
77
+ "text_model.model.layers.11.ffn.fc2.weight": "model-00001-of-00002.safetensors",
78
+ "text_model.model.layers.11.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
79
+ "text_model.model.layers.11.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "text_model.model.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
81
+ "text_model.model.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
82
+ "text_model.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
83
+ "text_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
84
+ "text_model.model.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
85
+ "text_model.model.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
86
+ "text_model.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
87
+ "text_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
88
+ "text_model.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
89
+ "text_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
90
+ "text_model.model.layers.11.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
91
+ "text_model.model.layers.11.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
92
+ "text_model.model.layers.12.ffn.fc1.bias": "model-00001-of-00002.safetensors",
93
+ "text_model.model.layers.12.ffn.fc1.weight": "model-00001-of-00002.safetensors",
94
+ "text_model.model.layers.12.ffn.fc2.bias": "model-00001-of-00002.safetensors",
95
+ "text_model.model.layers.12.ffn.fc2.weight": "model-00001-of-00002.safetensors",
96
+ "text_model.model.layers.12.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
97
+ "text_model.model.layers.12.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
98
+ "text_model.model.layers.12.final_layer_norm.bias": "model-00001-of-00002.safetensors",
99
+ "text_model.model.layers.12.final_layer_norm.weight": "model-00001-of-00002.safetensors",
100
+ "text_model.model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
101
+ "text_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
102
+ "text_model.model.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
103
+ "text_model.model.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
104
+ "text_model.model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
105
+ "text_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "text_model.model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
107
+ "text_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
108
+ "text_model.model.layers.12.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
109
+ "text_model.model.layers.12.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
110
+ "text_model.model.layers.13.ffn.fc1.bias": "model-00001-of-00002.safetensors",
111
+ "text_model.model.layers.13.ffn.fc1.weight": "model-00001-of-00002.safetensors",
112
+ "text_model.model.layers.13.ffn.fc2.bias": "model-00001-of-00002.safetensors",
113
+ "text_model.model.layers.13.ffn.fc2.weight": "model-00001-of-00002.safetensors",
114
+ "text_model.model.layers.13.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
115
+ "text_model.model.layers.13.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
116
+ "text_model.model.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
117
+ "text_model.model.layers.13.final_layer_norm.weight": "model-00001-of-00002.safetensors",
118
+ "text_model.model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
119
+ "text_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
120
+ "text_model.model.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
121
+ "text_model.model.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
122
+ "text_model.model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
123
+ "text_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
124
+ "text_model.model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
125
+ "text_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
126
+ "text_model.model.layers.13.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
127
+ "text_model.model.layers.13.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
128
+ "text_model.model.layers.14.ffn.fc1.bias": "model-00001-of-00002.safetensors",
129
+ "text_model.model.layers.14.ffn.fc1.weight": "model-00001-of-00002.safetensors",
130
+ "text_model.model.layers.14.ffn.fc2.bias": "model-00001-of-00002.safetensors",
131
+ "text_model.model.layers.14.ffn.fc2.weight": "model-00001-of-00002.safetensors",
132
+ "text_model.model.layers.14.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
133
+ "text_model.model.layers.14.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "text_model.model.layers.14.final_layer_norm.bias": "model-00001-of-00002.safetensors",
135
+ "text_model.model.layers.14.final_layer_norm.weight": "model-00001-of-00002.safetensors",
136
+ "text_model.model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
137
+ "text_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
138
+ "text_model.model.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
139
+ "text_model.model.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
140
+ "text_model.model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
141
+ "text_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
+ "text_model.model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
143
+ "text_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
144
+ "text_model.model.layers.14.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
145
+ "text_model.model.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
146
+ "text_model.model.layers.15.ffn.fc1.bias": "model-00001-of-00002.safetensors",
147
+ "text_model.model.layers.15.ffn.fc1.weight": "model-00001-of-00002.safetensors",
148
+ "text_model.model.layers.15.ffn.fc2.bias": "model-00001-of-00002.safetensors",
149
+ "text_model.model.layers.15.ffn.fc2.weight": "model-00001-of-00002.safetensors",
150
+ "text_model.model.layers.15.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
151
+ "text_model.model.layers.15.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
152
+ "text_model.model.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
153
+ "text_model.model.layers.15.final_layer_norm.weight": "model-00001-of-00002.safetensors",
154
+ "text_model.model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
155
+ "text_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
156
+ "text_model.model.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
157
+ "text_model.model.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
158
+ "text_model.model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
159
+ "text_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
+ "text_model.model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
161
+ "text_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "text_model.model.layers.15.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
163
+ "text_model.model.layers.15.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
164
+ "text_model.model.layers.16.ffn.fc1.bias": "model-00001-of-00002.safetensors",
165
+ "text_model.model.layers.16.ffn.fc1.weight": "model-00001-of-00002.safetensors",
166
+ "text_model.model.layers.16.ffn.fc2.bias": "model-00001-of-00002.safetensors",
167
+ "text_model.model.layers.16.ffn.fc2.weight": "model-00001-of-00002.safetensors",
168
+ "text_model.model.layers.16.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
169
+ "text_model.model.layers.16.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
170
+ "text_model.model.layers.16.final_layer_norm.bias": "model-00001-of-00002.safetensors",
171
+ "text_model.model.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
172
+ "text_model.model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
173
+ "text_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
174
+ "text_model.model.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
175
+ "text_model.model.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
176
+ "text_model.model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
177
+ "text_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
+ "text_model.model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
179
+ "text_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
180
+ "text_model.model.layers.16.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
181
+ "text_model.model.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
182
+ "text_model.model.layers.17.ffn.fc1.bias": "model-00001-of-00002.safetensors",
183
+ "text_model.model.layers.17.ffn.fc1.weight": "model-00001-of-00002.safetensors",
184
+ "text_model.model.layers.17.ffn.fc2.bias": "model-00001-of-00002.safetensors",
185
+ "text_model.model.layers.17.ffn.fc2.weight": "model-00001-of-00002.safetensors",
186
+ "text_model.model.layers.17.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
187
+ "text_model.model.layers.17.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
188
+ "text_model.model.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
189
+ "text_model.model.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
190
+ "text_model.model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
191
+ "text_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
192
+ "text_model.model.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
193
+ "text_model.model.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
194
+ "text_model.model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
195
+ "text_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
196
+ "text_model.model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
197
+ "text_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
198
+ "text_model.model.layers.17.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
199
+ "text_model.model.layers.17.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
200
+ "text_model.model.layers.18.ffn.fc1.bias": "model-00001-of-00002.safetensors",
201
+ "text_model.model.layers.18.ffn.fc1.weight": "model-00001-of-00002.safetensors",
202
+ "text_model.model.layers.18.ffn.fc2.bias": "model-00001-of-00002.safetensors",
203
+ "text_model.model.layers.18.ffn.fc2.weight": "model-00001-of-00002.safetensors",
204
+ "text_model.model.layers.18.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
205
+ "text_model.model.layers.18.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
206
+ "text_model.model.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
207
+ "text_model.model.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
208
+ "text_model.model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
209
+ "text_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
210
+ "text_model.model.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
211
+ "text_model.model.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
212
+ "text_model.model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
213
+ "text_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
214
+ "text_model.model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
215
+ "text_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
216
+ "text_model.model.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
217
+ "text_model.model.layers.18.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
218
+ "text_model.model.layers.19.ffn.fc1.bias": "model-00001-of-00002.safetensors",
219
+ "text_model.model.layers.19.ffn.fc1.weight": "model-00001-of-00002.safetensors",
220
+ "text_model.model.layers.19.ffn.fc2.bias": "model-00001-of-00002.safetensors",
221
+ "text_model.model.layers.19.ffn.fc2.weight": "model-00001-of-00002.safetensors",
222
+ "text_model.model.layers.19.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
223
+ "text_model.model.layers.19.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
224
+ "text_model.model.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
225
+ "text_model.model.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
226
+ "text_model.model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
227
+ "text_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
228
+ "text_model.model.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
229
+ "text_model.model.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
230
+ "text_model.model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
231
+ "text_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
232
+ "text_model.model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
233
+ "text_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
234
+ "text_model.model.layers.19.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
235
+ "text_model.model.layers.19.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
236
+ "text_model.model.layers.2.ffn.fc1.bias": "model-00001-of-00002.safetensors",
237
+ "text_model.model.layers.2.ffn.fc1.weight": "model-00001-of-00002.safetensors",
238
+ "text_model.model.layers.2.ffn.fc2.bias": "model-00001-of-00002.safetensors",
239
+ "text_model.model.layers.2.ffn.fc2.weight": "model-00001-of-00002.safetensors",
240
+ "text_model.model.layers.2.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
241
+ "text_model.model.layers.2.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
242
+ "text_model.model.layers.2.final_layer_norm.bias": "model-00001-of-00002.safetensors",
243
+ "text_model.model.layers.2.final_layer_norm.weight": "model-00001-of-00002.safetensors",
244
+ "text_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
245
+ "text_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
246
+ "text_model.model.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
247
+ "text_model.model.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
248
+ "text_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
249
+ "text_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
250
+ "text_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
251
+ "text_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
252
+ "text_model.model.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
253
+ "text_model.model.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
254
+ "text_model.model.layers.20.ffn.fc1.bias": "model-00001-of-00002.safetensors",
255
+ "text_model.model.layers.20.ffn.fc1.weight": "model-00001-of-00002.safetensors",
256
+ "text_model.model.layers.20.ffn.fc2.bias": "model-00001-of-00002.safetensors",
257
+ "text_model.model.layers.20.ffn.fc2.weight": "model-00001-of-00002.safetensors",
258
+ "text_model.model.layers.20.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
259
+ "text_model.model.layers.20.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
260
+ "text_model.model.layers.20.final_layer_norm.bias": "model-00001-of-00002.safetensors",
261
+ "text_model.model.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
262
+ "text_model.model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
263
+ "text_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
264
+ "text_model.model.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
265
+ "text_model.model.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
266
+ "text_model.model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
267
+ "text_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
268
+ "text_model.model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
269
+ "text_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
270
+ "text_model.model.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
271
+ "text_model.model.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
272
+ "text_model.model.layers.21.ffn.fc1.bias": "model-00001-of-00002.safetensors",
273
+ "text_model.model.layers.21.ffn.fc1.weight": "model-00001-of-00002.safetensors",
274
+ "text_model.model.layers.21.ffn.fc2.bias": "model-00001-of-00002.safetensors",
275
+ "text_model.model.layers.21.ffn.fc2.weight": "model-00001-of-00002.safetensors",
276
+ "text_model.model.layers.21.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
277
+ "text_model.model.layers.21.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
278
+ "text_model.model.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
279
+ "text_model.model.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
280
+ "text_model.model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
281
+ "text_model.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
282
+ "text_model.model.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
283
+ "text_model.model.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
284
+ "text_model.model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
285
+ "text_model.model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
286
+ "text_model.model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
287
+ "text_model.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
288
+ "text_model.model.layers.21.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
289
+ "text_model.model.layers.21.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
290
+ "text_model.model.layers.22.ffn.fc1.bias": "model-00001-of-00002.safetensors",
291
+ "text_model.model.layers.22.ffn.fc1.weight": "model-00001-of-00002.safetensors",
292
+ "text_model.model.layers.22.ffn.fc2.bias": "model-00001-of-00002.safetensors",
293
+ "text_model.model.layers.22.ffn.fc2.weight": "model-00001-of-00002.safetensors",
294
+ "text_model.model.layers.22.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
295
+ "text_model.model.layers.22.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
296
+ "text_model.model.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
297
+ "text_model.model.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
298
+ "text_model.model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
299
+ "text_model.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
300
+ "text_model.model.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
301
+ "text_model.model.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
302
+ "text_model.model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
303
+ "text_model.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
304
+ "text_model.model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
305
+ "text_model.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
306
+ "text_model.model.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
307
+ "text_model.model.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
308
+ "text_model.model.layers.23.ffn.fc1.bias": "model-00001-of-00002.safetensors",
309
+ "text_model.model.layers.23.ffn.fc1.weight": "model-00001-of-00002.safetensors",
310
+ "text_model.model.layers.23.ffn.fc2.bias": "model-00001-of-00002.safetensors",
311
+ "text_model.model.layers.23.ffn.fc2.weight": "model-00001-of-00002.safetensors",
312
+ "text_model.model.layers.23.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
313
+ "text_model.model.layers.23.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
314
+ "text_model.model.layers.23.final_layer_norm.bias": "model-00001-of-00002.safetensors",
315
+ "text_model.model.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
316
+ "text_model.model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
317
+ "text_model.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
318
+ "text_model.model.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
319
+ "text_model.model.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
320
+ "text_model.model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
321
+ "text_model.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
322
+ "text_model.model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
323
+ "text_model.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
324
+ "text_model.model.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
325
+ "text_model.model.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
326
+ "text_model.model.layers.3.ffn.fc1.bias": "model-00001-of-00002.safetensors",
327
+ "text_model.model.layers.3.ffn.fc1.weight": "model-00001-of-00002.safetensors",
328
+ "text_model.model.layers.3.ffn.fc2.bias": "model-00001-of-00002.safetensors",
329
+ "text_model.model.layers.3.ffn.fc2.weight": "model-00001-of-00002.safetensors",
330
+ "text_model.model.layers.3.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
331
+ "text_model.model.layers.3.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
332
+ "text_model.model.layers.3.final_layer_norm.bias": "model-00001-of-00002.safetensors",
333
+ "text_model.model.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
334
+ "text_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
335
+ "text_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
336
+ "text_model.model.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
337
+ "text_model.model.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
338
+ "text_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
339
+ "text_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
340
+ "text_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
341
+ "text_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
342
+ "text_model.model.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
343
+ "text_model.model.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
344
+ "text_model.model.layers.4.ffn.fc1.bias": "model-00001-of-00002.safetensors",
345
+ "text_model.model.layers.4.ffn.fc1.weight": "model-00001-of-00002.safetensors",
346
+ "text_model.model.layers.4.ffn.fc2.bias": "model-00001-of-00002.safetensors",
347
+ "text_model.model.layers.4.ffn.fc2.weight": "model-00001-of-00002.safetensors",
348
+ "text_model.model.layers.4.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
349
+ "text_model.model.layers.4.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "text_model.model.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
351
+ "text_model.model.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
352
+ "text_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
353
+ "text_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
354
+ "text_model.model.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
355
+ "text_model.model.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
356
+ "text_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
357
+ "text_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
358
+ "text_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
359
+ "text_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
+ "text_model.model.layers.4.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
361
+ "text_model.model.layers.4.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
362
+ "text_model.model.layers.5.ffn.fc1.bias": "model-00001-of-00002.safetensors",
363
+ "text_model.model.layers.5.ffn.fc1.weight": "model-00001-of-00002.safetensors",
364
+ "text_model.model.layers.5.ffn.fc2.bias": "model-00001-of-00002.safetensors",
365
+ "text_model.model.layers.5.ffn.fc2.weight": "model-00001-of-00002.safetensors",
366
+ "text_model.model.layers.5.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
367
+ "text_model.model.layers.5.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
368
+ "text_model.model.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
369
+ "text_model.model.layers.5.final_layer_norm.weight": "model-00001-of-00002.safetensors",
370
+ "text_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
371
+ "text_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
372
+ "text_model.model.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
373
+ "text_model.model.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
374
+ "text_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
375
+ "text_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
376
+ "text_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
377
+ "text_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
378
+ "text_model.model.layers.5.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
379
+ "text_model.model.layers.5.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
380
+ "text_model.model.layers.6.ffn.fc1.bias": "model-00001-of-00002.safetensors",
381
+ "text_model.model.layers.6.ffn.fc1.weight": "model-00001-of-00002.safetensors",
382
+ "text_model.model.layers.6.ffn.fc2.bias": "model-00001-of-00002.safetensors",
383
+ "text_model.model.layers.6.ffn.fc2.weight": "model-00001-of-00002.safetensors",
384
+ "text_model.model.layers.6.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
385
+ "text_model.model.layers.6.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
386
+ "text_model.model.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
387
+ "text_model.model.layers.6.final_layer_norm.weight": "model-00001-of-00002.safetensors",
388
+ "text_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
389
+ "text_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
390
+ "text_model.model.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
391
+ "text_model.model.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
392
+ "text_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
393
+ "text_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
394
+ "text_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
395
+ "text_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
396
+ "text_model.model.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
397
+ "text_model.model.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
398
+ "text_model.model.layers.7.ffn.fc1.bias": "model-00001-of-00002.safetensors",
399
+ "text_model.model.layers.7.ffn.fc1.weight": "model-00001-of-00002.safetensors",
400
+ "text_model.model.layers.7.ffn.fc2.bias": "model-00001-of-00002.safetensors",
401
+ "text_model.model.layers.7.ffn.fc2.weight": "model-00001-of-00002.safetensors",
402
+ "text_model.model.layers.7.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
403
+ "text_model.model.layers.7.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
404
+ "text_model.model.layers.7.final_layer_norm.bias": "model-00001-of-00002.safetensors",
405
+ "text_model.model.layers.7.final_layer_norm.weight": "model-00001-of-00002.safetensors",
406
+ "text_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
407
+ "text_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
408
+ "text_model.model.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
409
+ "text_model.model.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
410
+ "text_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
411
+ "text_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
412
+ "text_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
413
+ "text_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
414
+ "text_model.model.layers.7.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
415
+ "text_model.model.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
416
+ "text_model.model.layers.8.ffn.fc1.bias": "model-00001-of-00002.safetensors",
417
+ "text_model.model.layers.8.ffn.fc1.weight": "model-00001-of-00002.safetensors",
418
+ "text_model.model.layers.8.ffn.fc2.bias": "model-00001-of-00002.safetensors",
419
+ "text_model.model.layers.8.ffn.fc2.weight": "model-00001-of-00002.safetensors",
420
+ "text_model.model.layers.8.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
421
+ "text_model.model.layers.8.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
422
+ "text_model.model.layers.8.final_layer_norm.bias": "model-00001-of-00002.safetensors",
423
+ "text_model.model.layers.8.final_layer_norm.weight": "model-00001-of-00002.safetensors",
424
+ "text_model.model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
425
+ "text_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
426
+ "text_model.model.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
427
+ "text_model.model.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
428
+ "text_model.model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
429
+ "text_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
430
+ "text_model.model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
431
+ "text_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
432
+ "text_model.model.layers.8.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
433
+ "text_model.model.layers.8.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
434
+ "text_model.model.layers.9.ffn.fc1.bias": "model-00001-of-00002.safetensors",
435
+ "text_model.model.layers.9.ffn.fc1.weight": "model-00001-of-00002.safetensors",
436
+ "text_model.model.layers.9.ffn.fc2.bias": "model-00001-of-00002.safetensors",
437
+ "text_model.model.layers.9.ffn.fc2.weight": "model-00001-of-00002.safetensors",
438
+ "text_model.model.layers.9.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
439
+ "text_model.model.layers.9.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
440
+ "text_model.model.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
441
+ "text_model.model.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
442
+ "text_model.model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
443
+ "text_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
444
+ "text_model.model.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
445
+ "text_model.model.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
446
+ "text_model.model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
447
+ "text_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
448
+ "text_model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
449
+ "text_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
450
+ "text_model.model.layers.9.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
451
+ "text_model.model.layers.9.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
452
+ "text_model.model.segment_emb.weight": "model-00001-of-00002.safetensors",
453
+ "vision_model.embeddings.column_embedder.weight": "model-00001-of-00002.safetensors",
454
+ "vision_model.embeddings.patch_projection.bias": "model-00001-of-00002.safetensors",
455
+ "vision_model.embeddings.patch_projection.weight": "model-00001-of-00002.safetensors",
456
+ "vision_model.embeddings.row_embedder.weight": "model-00001-of-00002.safetensors",
457
+ "vision_model.encoder.layer.0.attention.key.weight": "model-00001-of-00002.safetensors",
458
+ "vision_model.encoder.layer.0.attention.output.weight": "model-00001-of-00002.safetensors",
459
+ "vision_model.encoder.layer.0.attention.query.weight": "model-00001-of-00002.safetensors",
460
+ "vision_model.encoder.layer.0.attention.value.weight": "model-00001-of-00002.safetensors",
461
+ "vision_model.encoder.layer.0.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
462
+ "vision_model.encoder.layer.0.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
463
+ "vision_model.encoder.layer.0.mlp.wo.weight": "model-00001-of-00002.safetensors",
464
+ "vision_model.encoder.layer.0.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
465
+ "vision_model.encoder.layer.0.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
466
+ "vision_model.encoder.layer.1.attention.key.weight": "model-00001-of-00002.safetensors",
467
+ "vision_model.encoder.layer.1.attention.output.weight": "model-00001-of-00002.safetensors",
468
+ "vision_model.encoder.layer.1.attention.query.weight": "model-00001-of-00002.safetensors",
469
+ "vision_model.encoder.layer.1.attention.value.weight": "model-00001-of-00002.safetensors",
470
+ "vision_model.encoder.layer.1.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
471
+ "vision_model.encoder.layer.1.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
472
+ "vision_model.encoder.layer.1.mlp.wo.weight": "model-00001-of-00002.safetensors",
473
+ "vision_model.encoder.layer.1.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
474
+ "vision_model.encoder.layer.1.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
475
+ "vision_model.encoder.layer.10.attention.key.weight": "model-00001-of-00002.safetensors",
476
+ "vision_model.encoder.layer.10.attention.output.weight": "model-00001-of-00002.safetensors",
477
+ "vision_model.encoder.layer.10.attention.query.weight": "model-00001-of-00002.safetensors",
478
+ "vision_model.encoder.layer.10.attention.value.weight": "model-00001-of-00002.safetensors",
479
+ "vision_model.encoder.layer.10.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
480
+ "vision_model.encoder.layer.10.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
481
+ "vision_model.encoder.layer.10.mlp.wo.weight": "model-00001-of-00002.safetensors",
482
+ "vision_model.encoder.layer.10.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
483
+ "vision_model.encoder.layer.10.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
484
+ "vision_model.encoder.layer.11.attention.key.weight": "model-00001-of-00002.safetensors",
485
+ "vision_model.encoder.layer.11.attention.output.weight": "model-00001-of-00002.safetensors",
486
+ "vision_model.encoder.layer.11.attention.query.weight": "model-00001-of-00002.safetensors",
487
+ "vision_model.encoder.layer.11.attention.value.weight": "model-00001-of-00002.safetensors",
488
+ "vision_model.encoder.layer.11.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
489
+ "vision_model.encoder.layer.11.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
490
+ "vision_model.encoder.layer.11.mlp.wo.weight": "model-00001-of-00002.safetensors",
491
+ "vision_model.encoder.layer.11.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
492
+ "vision_model.encoder.layer.11.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
493
+ "vision_model.encoder.layer.12.attention.key.weight": "model-00001-of-00002.safetensors",
494
+ "vision_model.encoder.layer.12.attention.output.weight": "model-00001-of-00002.safetensors",
495
+ "vision_model.encoder.layer.12.attention.query.weight": "model-00001-of-00002.safetensors",
496
+ "vision_model.encoder.layer.12.attention.value.weight": "model-00001-of-00002.safetensors",
497
+ "vision_model.encoder.layer.12.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
498
+ "vision_model.encoder.layer.12.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
499
+ "vision_model.encoder.layer.12.mlp.wo.weight": "model-00001-of-00002.safetensors",
500
+ "vision_model.encoder.layer.12.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
501
+ "vision_model.encoder.layer.12.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
502
+ "vision_model.encoder.layer.13.attention.key.weight": "model-00001-of-00002.safetensors",
503
+ "vision_model.encoder.layer.13.attention.output.weight": "model-00001-of-00002.safetensors",
504
+ "vision_model.encoder.layer.13.attention.query.weight": "model-00001-of-00002.safetensors",
505
+ "vision_model.encoder.layer.13.attention.value.weight": "model-00001-of-00002.safetensors",
506
+ "vision_model.encoder.layer.13.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
507
+ "vision_model.encoder.layer.13.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
508
+ "vision_model.encoder.layer.13.mlp.wo.weight": "model-00001-of-00002.safetensors",
509
+ "vision_model.encoder.layer.13.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
510
+ "vision_model.encoder.layer.13.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
511
+ "vision_model.encoder.layer.14.attention.key.weight": "model-00002-of-00002.safetensors",
512
+ "vision_model.encoder.layer.14.attention.output.weight": "model-00002-of-00002.safetensors",
513
+ "vision_model.encoder.layer.14.attention.query.weight": "model-00002-of-00002.safetensors",
514
+ "vision_model.encoder.layer.14.attention.value.weight": "model-00002-of-00002.safetensors",
515
+ "vision_model.encoder.layer.14.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
516
+ "vision_model.encoder.layer.14.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
517
+ "vision_model.encoder.layer.14.mlp.wo.weight": "model-00002-of-00002.safetensors",
518
+ "vision_model.encoder.layer.14.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
519
+ "vision_model.encoder.layer.14.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
520
+ "vision_model.encoder.layer.15.attention.key.weight": "model-00002-of-00002.safetensors",
521
+ "vision_model.encoder.layer.15.attention.output.weight": "model-00002-of-00002.safetensors",
522
+ "vision_model.encoder.layer.15.attention.query.weight": "model-00002-of-00002.safetensors",
523
+ "vision_model.encoder.layer.15.attention.value.weight": "model-00002-of-00002.safetensors",
524
+ "vision_model.encoder.layer.15.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
525
+ "vision_model.encoder.layer.15.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
526
+ "vision_model.encoder.layer.15.mlp.wo.weight": "model-00002-of-00002.safetensors",
527
+ "vision_model.encoder.layer.15.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
528
+ "vision_model.encoder.layer.15.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
529
+ "vision_model.encoder.layer.16.attention.key.weight": "model-00002-of-00002.safetensors",
530
+ "vision_model.encoder.layer.16.attention.output.weight": "model-00002-of-00002.safetensors",
531
+ "vision_model.encoder.layer.16.attention.query.weight": "model-00002-of-00002.safetensors",
532
+ "vision_model.encoder.layer.16.attention.value.weight": "model-00002-of-00002.safetensors",
533
+ "vision_model.encoder.layer.16.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
534
+ "vision_model.encoder.layer.16.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
535
+ "vision_model.encoder.layer.16.mlp.wo.weight": "model-00002-of-00002.safetensors",
536
+ "vision_model.encoder.layer.16.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
537
+ "vision_model.encoder.layer.16.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
538
+ "vision_model.encoder.layer.17.attention.key.weight": "model-00002-of-00002.safetensors",
539
+ "vision_model.encoder.layer.17.attention.output.weight": "model-00002-of-00002.safetensors",
540
+ "vision_model.encoder.layer.17.attention.query.weight": "model-00002-of-00002.safetensors",
541
+ "vision_model.encoder.layer.17.attention.value.weight": "model-00002-of-00002.safetensors",
542
+ "vision_model.encoder.layer.17.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
543
+ "vision_model.encoder.layer.17.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
544
+ "vision_model.encoder.layer.17.mlp.wo.weight": "model-00002-of-00002.safetensors",
545
+ "vision_model.encoder.layer.17.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
546
+ "vision_model.encoder.layer.17.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
547
+ "vision_model.encoder.layer.2.attention.key.weight": "model-00001-of-00002.safetensors",
548
+ "vision_model.encoder.layer.2.attention.output.weight": "model-00001-of-00002.safetensors",
549
+ "vision_model.encoder.layer.2.attention.query.weight": "model-00001-of-00002.safetensors",
550
+ "vision_model.encoder.layer.2.attention.value.weight": "model-00001-of-00002.safetensors",
551
+ "vision_model.encoder.layer.2.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
552
+ "vision_model.encoder.layer.2.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
553
+ "vision_model.encoder.layer.2.mlp.wo.weight": "model-00001-of-00002.safetensors",
554
+ "vision_model.encoder.layer.2.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
555
+ "vision_model.encoder.layer.2.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
556
+ "vision_model.encoder.layer.3.attention.key.weight": "model-00001-of-00002.safetensors",
557
+ "vision_model.encoder.layer.3.attention.output.weight": "model-00001-of-00002.safetensors",
558
+ "vision_model.encoder.layer.3.attention.query.weight": "model-00001-of-00002.safetensors",
559
+ "vision_model.encoder.layer.3.attention.value.weight": "model-00001-of-00002.safetensors",
560
+ "vision_model.encoder.layer.3.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
561
+ "vision_model.encoder.layer.3.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
562
+ "vision_model.encoder.layer.3.mlp.wo.weight": "model-00001-of-00002.safetensors",
563
+ "vision_model.encoder.layer.3.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
564
+ "vision_model.encoder.layer.3.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
565
+ "vision_model.encoder.layer.4.attention.key.weight": "model-00001-of-00002.safetensors",
566
+ "vision_model.encoder.layer.4.attention.output.weight": "model-00001-of-00002.safetensors",
567
+ "vision_model.encoder.layer.4.attention.query.weight": "model-00001-of-00002.safetensors",
568
+ "vision_model.encoder.layer.4.attention.value.weight": "model-00001-of-00002.safetensors",
569
+ "vision_model.encoder.layer.4.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
570
+ "vision_model.encoder.layer.4.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
571
+ "vision_model.encoder.layer.4.mlp.wo.weight": "model-00001-of-00002.safetensors",
572
+ "vision_model.encoder.layer.4.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
573
+ "vision_model.encoder.layer.4.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
574
+ "vision_model.encoder.layer.5.attention.key.weight": "model-00001-of-00002.safetensors",
575
+ "vision_model.encoder.layer.5.attention.output.weight": "model-00001-of-00002.safetensors",
576
+ "vision_model.encoder.layer.5.attention.query.weight": "model-00001-of-00002.safetensors",
577
+ "vision_model.encoder.layer.5.attention.value.weight": "model-00001-of-00002.safetensors",
578
+ "vision_model.encoder.layer.5.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
579
+ "vision_model.encoder.layer.5.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
580
+ "vision_model.encoder.layer.5.mlp.wo.weight": "model-00001-of-00002.safetensors",
581
+ "vision_model.encoder.layer.5.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
582
+ "vision_model.encoder.layer.5.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
583
+ "vision_model.encoder.layer.6.attention.key.weight": "model-00001-of-00002.safetensors",
584
+ "vision_model.encoder.layer.6.attention.output.weight": "model-00001-of-00002.safetensors",
585
+ "vision_model.encoder.layer.6.attention.query.weight": "model-00001-of-00002.safetensors",
586
+ "vision_model.encoder.layer.6.attention.value.weight": "model-00001-of-00002.safetensors",
587
+ "vision_model.encoder.layer.6.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
588
+ "vision_model.encoder.layer.6.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
589
+ "vision_model.encoder.layer.6.mlp.wo.weight": "model-00001-of-00002.safetensors",
590
+ "vision_model.encoder.layer.6.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
591
+ "vision_model.encoder.layer.6.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
592
+ "vision_model.encoder.layer.7.attention.key.weight": "model-00001-of-00002.safetensors",
593
+ "vision_model.encoder.layer.7.attention.output.weight": "model-00001-of-00002.safetensors",
594
+ "vision_model.encoder.layer.7.attention.query.weight": "model-00001-of-00002.safetensors",
595
+ "vision_model.encoder.layer.7.attention.value.weight": "model-00001-of-00002.safetensors",
596
+ "vision_model.encoder.layer.7.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
597
+ "vision_model.encoder.layer.7.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
598
+ "vision_model.encoder.layer.7.mlp.wo.weight": "model-00001-of-00002.safetensors",
599
+ "vision_model.encoder.layer.7.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
600
+ "vision_model.encoder.layer.7.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
601
+ "vision_model.encoder.layer.8.attention.key.weight": "model-00001-of-00002.safetensors",
602
+ "vision_model.encoder.layer.8.attention.output.weight": "model-00001-of-00002.safetensors",
603
+ "vision_model.encoder.layer.8.attention.query.weight": "model-00001-of-00002.safetensors",
604
+ "vision_model.encoder.layer.8.attention.value.weight": "model-00001-of-00002.safetensors",
605
+ "vision_model.encoder.layer.8.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
606
+ "vision_model.encoder.layer.8.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
607
+ "vision_model.encoder.layer.8.mlp.wo.weight": "model-00001-of-00002.safetensors",
608
+ "vision_model.encoder.layer.8.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
609
+ "vision_model.encoder.layer.8.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
610
+ "vision_model.encoder.layer.9.attention.key.weight": "model-00001-of-00002.safetensors",
611
+ "vision_model.encoder.layer.9.attention.output.weight": "model-00001-of-00002.safetensors",
612
+ "vision_model.encoder.layer.9.attention.query.weight": "model-00001-of-00002.safetensors",
613
+ "vision_model.encoder.layer.9.attention.value.weight": "model-00001-of-00002.safetensors",
614
+ "vision_model.encoder.layer.9.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
615
+ "vision_model.encoder.layer.9.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
616
+ "vision_model.encoder.layer.9.mlp.wo.weight": "model-00001-of-00002.safetensors",
617
+ "vision_model.encoder.layer.9.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
618
+ "vision_model.encoder.layer.9.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
619
+ "vision_model.layernorm.weight": "model-00002-of-00002.safetensors"
620
+ }
621
+ }
models/kosmos-2.5-chat/preprocessor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "image_processor_type": "Kosmos2_5ImageProcessor",
3
+ "processor_class": "Kosmos2_5Processor"
4
+ }
5
+
models/kosmos-2.5-chat/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/microsoft/kosmos-2.5-chat
models/kosmos-2.5-chat/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<image>",
3
+ "bos_token": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "</image>",
11
+ "eos_token": {
12
+ "content": "</s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<s>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
models/kosmos-2.5-chat/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/kosmos-2.5-chat/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
models/kosmos-2.5-ft/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-ft/README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - merve/doclaynet-small
5
+ base_model:
6
+ - microsoft/kosmos-2.5
7
+ pipeline_tag: image-text-to-text
8
+ library_name: transformers
9
+ tags:
10
+ - ocr
11
+ ---
12
+
13
+ ## Kosmos-2.5 Fine-tuned on DocLayNet
14
+
15
+ Kosmos-2.5 fine-tuned on grounded OCR (OCR with bounding boxes), find the script here: ([GH](https://github.com/merveenoyan/smol-vision/blob/main/Grounded_Fine_tuning%20GH.ipynb), [HF](https://huggingface.co/merve/smol-vision/blob/main/Grounded_Fine_tuning.ipynb))
16
+
17
+ Try the (base model) Kosmos-2.5 demo [here](https://huggingface.co/spaces/nielsr/kosmos-2.5-demo).
18
+
19
+ Here's the inference code:
20
+
21
+ ```python
22
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
23
+ import torch
24
+
25
+ model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
26
+ processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
27
+
28
+ import requests
29
+ from PIL import Image
30
+ url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
31
+ image = Image.open(requests.get(url, stream=True).raw)
32
+
33
+
34
+ import re
35
+ prompt = "<ocr>"
36
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
37
+ height, width = inputs.pop("height"), inputs.pop("width")
38
+ raw_width, raw_height = image.size
39
+ scale_height = raw_height / height
40
+ scale_width = raw_width / width
41
+
42
+ inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
43
+ inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
44
+
45
+ generated_ids = model.generate(
46
+ **inputs,
47
+ max_new_tokens=2000,
48
+ )
49
+
50
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
51
+
52
+ import re
53
+ from PIL import ImageDraw
54
+
55
+ def post_process(y, scale_height, scale_width):
56
+
57
+ pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
58
+ bboxes_raw = re.findall(pattern, y)
59
+ lines = re.split(pattern, y)[1:]
60
+ bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
61
+
62
+ out_lines = []
63
+ for i, box in enumerate(bboxes):
64
+ if len(box) != 4:
65
+ continue
66
+ x0, y0, x1, y1 = box
67
+
68
+ if x0 >= x1 or y0 >= y1:
69
+ continue
70
+
71
+ sx0 = int(x0 * scale_width)
72
+ sy0 = int(y0 * scale_height)
73
+ sx1 = int(x1 * scale_width)
74
+ sy1 = int(y1 * scale_height)
75
+
76
+ label = lines[i] if i < len(lines) else ""
77
+ label = label.lstrip(", ").strip()
78
+
79
+ out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
80
+
81
+ return "\n".join(out_lines)
82
+
83
+
84
+ output_text = post_process(generated_text[0], scale_height, scale_width)
85
+ print(output_text)
86
+
87
+ draw = ImageDraw.Draw(image)
88
+
89
+ for line in output_text.strip().splitlines():
90
+ coords = re.findall(r"-?\d+", line)[:8]
91
+ if len(coords) < 8:
92
+ continue
93
+ xy = list(map(int, coords))
94
+ draw.polygon(xy, outline="red")
95
+
96
+ image.save("output.png")
97
+ ```
98
+
99
+ The image and the text (shortened here) output:
100
+ ```
101
+ 338,17,673,17,673,82,338,82,CONFIDENTIAL
102
+ 445,68,478,68,478,97,445,97,-2-
103
+ 169,129,193,129,193,157,169,157,6.
104
+ 334,129,910,129,910,157,334,157,A suggestion that the light could have been produced by
105
+ 169,150,900,150,900,177,169,177,a photo-flash from a high-flying aircraft was discounted. No aircraft
106
+ 166,171,856,171,856,198,166,198,was heard at the time and, in any case, no known photo-flash has a
107
+ ...
108
+ ```
109
+
110
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6141a88b3a0ec78603c9e784/fmSxfYyOOhMHLt4Mw9xWg.png)
111
+
models/kosmos-2.5-ft/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Kosmos2_5ForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "latent_query_num": 2048,
7
+ "model_type": "kosmos-2.5",
8
+ "text_config": {
9
+ "activation_dropout": 0.0,
10
+ "activation_function": "gelu",
11
+ "attention_dropout": 0.0,
12
+ "attention_heads": 16,
13
+ "dropout": 0,
14
+ "dtype": "bfloat16",
15
+ "embed_dim": 1536,
16
+ "ffn_dim": 6144,
17
+ "init_std": 0.02,
18
+ "layer_norm_eps": 1e-05,
19
+ "layerdrop": 0.0,
20
+ "layers": 24,
21
+ "max_position_embeddings": 4096,
22
+ "model_type": "kosmos_2_5_text_model",
23
+ "scale_embedding": true,
24
+ "use_cache": true,
25
+ "vocab_size": 108481
26
+ },
27
+ "transformers_version": "4.56.1",
28
+ "vision_config": {
29
+ "attention_dropout": 0.0,
30
+ "dense_act_fn": "gelu_new",
31
+ "dropout_rate": 0.0,
32
+ "dtype": "bfloat16",
33
+ "head_dim": 64,
34
+ "hidden_size": 1536,
35
+ "initializer_factor": 1.0,
36
+ "initializer_range": 1e-10,
37
+ "intermediate_size": 3968,
38
+ "layer_norm_eps": 1e-06,
39
+ "max_length": 4096,
40
+ "max_num_patches": 4096,
41
+ "model_type": "kosmos_2_5_vision_model",
42
+ "num_attention_heads": 24,
43
+ "num_hidden_layers": 18,
44
+ "patch_embed_hidden_size": 768
45
+ }
46
+ }
models/kosmos-2.5-ft/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "eos_token_id": 2,
4
+ "num_beams": 1,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.56.1"
7
+ }
models/kosmos-2.5-ft/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b51c3ece1fdebe5dcc63a6079ddbae5a3a8f565ad054600b406a11eaa0fc768d
3
+ size 2749368352
models/kosmos-2.5-ft/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72903c0342414f6579137b6bf964247c48f931461f653622ceeb02a716dd60c1
3
+ size 5499116125
models/kosmos-2.5-ft/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a794a6cb9cd4bbd0c53d08db0e20a5536c789bba6f22113385c1c408d58908bd
3
+ size 14645
models/kosmos-2.5-ft/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d3e048a1e5c9dc7581e3872c3b16feadec0e02e34c6509590158830c91d1422
3
+ size 1465
models/kosmos-2.5-ft/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/merve/kosmos-2.5-ft
models/kosmos-2.5-ft/trainer_state.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1126,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.17777777777777778,
14
+ "grad_norm": 1.1875,
15
+ "learning_rate": 1.827402135231317e-05,
16
+ "loss": 0.4486,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.35555555555555557,
21
+ "grad_norm": 1.21875,
22
+ "learning_rate": 1.6494661921708185e-05,
23
+ "loss": 0.1052,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.5333333333333333,
28
+ "grad_norm": 0.90625,
29
+ "learning_rate": 1.4715302491103204e-05,
30
+ "loss": 0.0932,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.7111111111111111,
35
+ "grad_norm": 1.3984375,
36
+ "learning_rate": 1.2935943060498222e-05,
37
+ "loss": 0.0988,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.8888888888888888,
42
+ "grad_norm": 1.5703125,
43
+ "learning_rate": 1.1156583629893238e-05,
44
+ "loss": 0.0903,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 1.0657777777777777,
49
+ "grad_norm": 1.265625,
50
+ "learning_rate": 9.377224199288258e-06,
51
+ "loss": 0.0912,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 1.2435555555555555,
56
+ "grad_norm": 0.65234375,
57
+ "learning_rate": 7.597864768683275e-06,
58
+ "loss": 0.0798,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 1.4213333333333333,
63
+ "grad_norm": 1.890625,
64
+ "learning_rate": 5.818505338078292e-06,
65
+ "loss": 0.0835,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 1.5991111111111111,
70
+ "grad_norm": 2.015625,
71
+ "learning_rate": 4.03914590747331e-06,
72
+ "loss": 0.0785,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 1.7768888888888887,
77
+ "grad_norm": 1.546875,
78
+ "learning_rate": 2.2597864768683274e-06,
79
+ "loss": 0.0785,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 1.9546666666666668,
84
+ "grad_norm": 2.15625,
85
+ "learning_rate": 4.804270462633452e-07,
86
+ "loss": 0.079,
87
+ "step": 1100
88
+ }
89
+ ],
90
+ "logging_steps": 100,
91
+ "max_steps": 1126,
92
+ "num_input_tokens_seen": 0,
93
+ "num_train_epochs": 2,
94
+ "save_steps": 1000,
95
+ "stateful_callbacks": {
96
+ "TrainerControl": {
97
+ "args": {
98
+ "should_epoch_stop": false,
99
+ "should_evaluate": false,
100
+ "should_log": false,
101
+ "should_save": true,
102
+ "should_training_stop": true
103
+ },
104
+ "attributes": {}
105
+ }
106
+ },
107
+ "total_flos": 1.1068310033650483e+17,
108
+ "train_batch_size": 1,
109
+ "trial_name": null,
110
+ "trial_params": null
111
+ }
models/kosmos-2.5-ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:285a70ced9abc4734407c103d93dd57bdfb7ce4329159887d23b785dbb4645a4
3
+ size 5777
models/kosmos-2.5/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ output.png filter=lfs diff=lfs merge=lfs -text
37
+ receipt_00008.png filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5/README.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ library_name: transformers
5
+ pipeline_tag: image-text-to-text
6
+ ---
7
+ # Kosmos-2.5
8
+
9
+ [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
10
+
11
+ ## Model description
12
+
13
+ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
14
+
15
+ [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
16
+
17
+ ## NOTE:
18
+ Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
19
+
20
+ ## Inference
21
+
22
+ KOSMOS-2.5 is supported from Transformers >= 4.56. Find the docs [here](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5).
23
+
24
+ **Markdown Task:** For usage instructions, please refer to [md.py](md.py).
25
+
26
+ ```py
27
+ import re
28
+ import torch
29
+ import requests
30
+ from PIL import Image, ImageDraw
31
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
32
+
33
+ repo = "microsoft/kosmos-2.5"
34
+ device = "cuda:0"
35
+ dtype = torch.bfloat16
36
+ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
37
+ processor = AutoProcessor.from_pretrained(repo)
38
+
39
+ # sample image
40
+ url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
41
+ image = Image.open(requests.get(url, stream=True).raw)
42
+
43
+ prompt = "<md>"
44
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
45
+
46
+ height, width = inputs.pop("height"), inputs.pop("width")
47
+ raw_width, raw_height = image.size
48
+ scale_height = raw_height / height
49
+ scale_width = raw_width / width
50
+
51
+ inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
52
+ inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
53
+ generated_ids = model.generate(
54
+ **inputs,
55
+ max_new_tokens=1024,
56
+ )
57
+
58
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
59
+ print(generated_text[0])
60
+ ```
61
+
62
+ **OCR Task:** For usage instructions, please refer to [ocr.py](ocr.py).
63
+
64
+ ```py
65
+ import re
66
+ import torch
67
+ import requests
68
+ from PIL import Image, ImageDraw
69
+ from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
70
+
71
+ repo = "microsoft/kosmos-2.5"
72
+ device = "cuda:0"
73
+ dtype = torch.bfloat16
74
+ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
75
+ processor = AutoProcessor.from_pretrained(repo)
76
+
77
+ # sample image
78
+ url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
79
+ image = Image.open(requests.get(url, stream=True).raw)
80
+
81
+ # bs = 1
82
+ prompt = "<ocr>"
83
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
84
+ height, width = inputs.pop("height"), inputs.pop("width")
85
+ raw_width, raw_height = image.size
86
+ scale_height = raw_height / height
87
+ scale_width = raw_width / width
88
+
89
+ # bs > 1, batch generation
90
+ # inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
91
+ # height, width = inputs.pop("height"), inputs.pop("width")
92
+ # raw_width, raw_height = image.size
93
+ # scale_height = raw_height / height[0]
94
+ # scale_width = raw_width / width[0]
95
+
96
+ inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
97
+ inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
98
+ generated_ids = model.generate(
99
+ **inputs,
100
+ max_new_tokens=1024,
101
+ )
102
+
103
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
104
+ def post_process(y, scale_height, scale_width):
105
+ y = y.replace(prompt, "")
106
+ if "<md>" in prompt:
107
+ return y
108
+ pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
109
+ bboxs_raw = re.findall(pattern, y)
110
+ lines = re.split(pattern, y)[1:]
111
+ bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
112
+ bboxs = [[int(j) for j in i] for i in bboxs]
113
+ info = ""
114
+ for i in range(len(lines)):
115
+ box = bboxs[i]
116
+ x0, y0, x1, y1 = box
117
+ if not (x0 >= x1 or y0 >= y1):
118
+ x0 = int(x0 * scale_width)
119
+ y0 = int(y0 * scale_height)
120
+ x1 = int(x1 * scale_width)
121
+ y1 = int(y1 * scale_height)
122
+ info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
123
+ return info
124
+
125
+ output_text = post_process(generated_text[0], scale_height, scale_width)
126
+ print(output_text)
127
+
128
+ draw = ImageDraw.Draw(image)
129
+ lines = output_text.split("\n")
130
+ for line in lines:
131
+ # draw the bounding box
132
+ line = list(line.split(","))
133
+ if len(line) < 8:
134
+ continue
135
+ line = list(map(int, line[:8]))
136
+ draw.polygon(line, outline="red")
137
+ image.save("output.png")
138
+ ```
139
+
140
+ ## Citation
141
+
142
+ If you find Kosmos-2.5 useful in your research, please cite the following paper:
143
+
144
+ ```
145
+ @article{lv2023kosmos,
146
+ title={Kosmos-2.5: A multimodal literate model},
147
+ author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
148
+ journal={arXiv preprint arXiv:2309.11419},
149
+ year={2023}
150
+ }
151
+ ```
152
+
153
+ ## License
154
+ The content of this project itself is licensed under the [MIT](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE)
155
+
156
+ [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
models/kosmos-2.5/ckpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:676d0efd1dddf3785644918dd598d7734f9ed6e3eb59f806299ca8b7aefa0967
3
+ size 6165757107