Kosmos (code, demo, models, paper)
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +5 -0
- KOSMOS-2.5. A Multimodal Literate Model.pdf +3 -0
- Kosmos-2. Grounding Multimodal Large Language Models to the World.pdf +3 -0
- Language Is Not All You Need. Aligning Perception with Language Models.pdf +3 -0
- code/Kosmos2.5 (kyegomez).zip +3 -0
- code/kosmos2.5-8bitq-onnx.zip +3 -0
- code/kosmos2.5-int8-quantization.zip +3 -0
- demo/kosmos-2.5-demo/.gitattributes +35 -0
- demo/kosmos-2.5-demo/README.md +61 -0
- demo/kosmos-2.5-demo/app.py +315 -0
- demo/kosmos-2.5-demo/requirements.txt +7 -0
- demo/kosmos-2.5-demo/source.txt +1 -0
- models/kosmos-2.5-4bit-text/.gitattributes +35 -0
- models/kosmos-2.5-4bit-text/README.md +199 -0
- models/kosmos-2.5-4bit-text/config.json +26 -0
- models/kosmos-2.5-4bit-text/generation_config.json +7 -0
- models/kosmos-2.5-4bit-text/model.safetensors +3 -0
- models/kosmos-2.5-4bit-text/source.txt +1 -0
- models/kosmos-2.5-4bit-vision/.gitattributes +35 -0
- models/kosmos-2.5-4bit-vision/README.md +199 -0
- models/kosmos-2.5-4bit-vision/config.json +22 -0
- models/kosmos-2.5-4bit-vision/model.safetensors +3 -0
- models/kosmos-2.5-4bit-vision/source.txt +1 -0
- models/kosmos-2.5-chat/.gitattributes +35 -0
- models/kosmos-2.5-chat/README.md +87 -0
- models/kosmos-2.5-chat/chat.py +40 -0
- models/kosmos-2.5-chat/config.json +163 -0
- models/kosmos-2.5-chat/generation_config.json +8 -0
- models/kosmos-2.5-chat/model-00001-of-00002.safetensors +3 -0
- models/kosmos-2.5-chat/model-00002-of-00002.safetensors +3 -0
- models/kosmos-2.5-chat/model.safetensors.index.json +621 -0
- models/kosmos-2.5-chat/preprocessor_config.json +5 -0
- models/kosmos-2.5-chat/source.txt +1 -0
- models/kosmos-2.5-chat/special_tokens_map.json +33 -0
- models/kosmos-2.5-chat/tokenizer.json +0 -0
- models/kosmos-2.5-chat/tokenizer_config.json +0 -0
- models/kosmos-2.5-ft/.gitattributes +35 -0
- models/kosmos-2.5-ft/README.md +111 -0
- models/kosmos-2.5-ft/config.json +46 -0
- models/kosmos-2.5-ft/generation_config.json +7 -0
- models/kosmos-2.5-ft/model.safetensors +3 -0
- models/kosmos-2.5-ft/optimizer.pt +3 -0
- models/kosmos-2.5-ft/rng_state.pth +3 -0
- models/kosmos-2.5-ft/scheduler.pt +3 -0
- models/kosmos-2.5-ft/source.txt +1 -0
- models/kosmos-2.5-ft/trainer_state.json +111 -0
- models/kosmos-2.5-ft/training_args.bin +3 -0
- models/kosmos-2.5/.gitattributes +37 -0
- models/kosmos-2.5/README.md +156 -0
- models/kosmos-2.5/ckpt.pt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Kosmos-2.[[:space:]]Grounding[[:space:]]Multimodal[[:space:]]Large[[:space:]]Language[[:space:]]Models[[:space:]]to[[:space:]]the[[:space:]]World.pdf filter=lfs diff=lfs merge=lfs -text
+KOSMOS-2.5.[[:space:]]A[[:space:]]Multimodal[[:space:]]Literate[[:space:]]Model.pdf filter=lfs diff=lfs merge=lfs -text
+Language[[:space:]]Is[[:space:]]Not[[:space:]]All[[:space:]]You[[:space:]]Need.[[:space:]]Aligning[[:space:]]Perception[[:space:]]with[[:space:]]Language[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
+models/kosmos-2.5/output.png filter=lfs diff=lfs merge=lfs -text
+models/kosmos-2.5/receipt_00008.png filter=lfs diff=lfs merge=lfs -text
KOSMOS-2.5. A Multimodal Literate Model.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f659ae6c3696172faf4afa57c6c6d563d9fc026ca378ceccdbfb33a5e5ee20f1
size 6426197
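The PDFs, archives, and weight files in this commit are stored as Git LFS pointer files like the three lines above (spec version, SHA-256 object id, and size in bytes) rather than as raw content. A minimal sketch of reading such a pointer in Python; the helper and the local path are illustrative, not part of this repository:

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Parse a git-lfs v1 pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    fields["size"] = int(fields["size"])  # e.g. 6426197 bytes for the KOSMOS-2.5 paper
    return fields

# Hypothetical usage after cloning without `git lfs pull`:
# info = parse_lfs_pointer("KOSMOS-2.5. A Multimodal Literate Model.pdf")
# print(info["oid"], info["size"])
```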
Kosmos-2. Grounding Multimodal Large Language Models to the World.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:064214db82bb573831fd102ace00be90147633e7f47dac491089619e03bc7e58
size 7580509
Language Is Not All You Need. Aligning Perception with Language Models.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6a848b830a5ceabaf4fd1cbd38d32572edca05d7db513398bca459f3cf6352a
size 3743156
code/Kosmos2.5 (kyegomez).zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e20f0129cf0521f1b1e2b0b8d283b734644baade3f9a58545b6201d60f31eddc
size 522240
code/kosmos2.5-8bitq-onnx.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05f788b060b553a6c05a75982c60059ea1245911d17f693caabd64d1f959618c
size 68711
code/kosmos2.5-int8-quantization.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28621b029cd25851142c1d0670cf6d0f867e3845cd71c2766acfc7996dd41a06
size 62854
demo/kosmos-2.5-demo/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
demo/kosmos-2.5-demo/README.md
ADDED
@@ -0,0 +1,61 @@
---
title: KOSMOS-2.5 Document AI Demo
emoji: 📄
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 4.44.0
app_file: app.py
pinned: false
license: mit
---

# KOSMOS-2.5 Document AI Demo

This Space demonstrates the capabilities of Microsoft's **KOSMOS-2.5**, a multimodal literate model for machine reading of text-intensive images.

## Features

🔥 **Three powerful modes**:

1. **📝 Markdown Generation**: Convert document images to clean markdown format
2. **🔍 OCR with Bounding Boxes**: Extract text with precise spatial coordinates and visualization
3. **💬 Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat

## What is KOSMOS-2.5?

KOSMOS-2.5 is Microsoft's latest document AI model that excels at understanding text-rich images. It can:

- Generate spatially-aware text blocks with coordinates
- Produce structured markdown output that captures document styles
- Answer questions about document content through the chat variant

The model was pre-trained on 357.4 million text-rich document images and achieves performance comparable to much larger models (1.3B vs 7B parameters) on visual question answering benchmarks.

## Example Use Cases

- **Receipts**: Extract itemized information or ask "What's the total amount?"
- **Forms**: Convert to structured format or query specific fields
- **Articles**: Get clean markdown or ask content-specific questions
- **Screenshots**: Extract UI text or get information about elements

## Model Information

- **Base Model**: [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5)
- **Chat Model**: [microsoft/kosmos-2.5-chat](https://huggingface.co/microsoft/kosmos-2.5-chat)
- **Paper**: [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)

## Note

This is a generative model and may occasionally produce inaccurate results. Please verify outputs for critical applications.

## Citation

```bibtex
@article{lv2023kosmos,
  title={Kosmos-2.5: A multimodal literate model},
  author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
  journal={arXiv preprint arXiv:2309.11419},
  year={2023}
}
```
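Outside the Space, the markdown mode described above amounts to prompting the base model with the `<md>` task token. Below is a minimal sketch distilled from the demo's app.py (the next file in this diff); the example receipt URL, the CUDA/bfloat16 setup, and the prompt handling are taken from that file, while flash-attention and the Gradio UI are omitted for brevity:

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5"
model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    repo, device_map="cuda", torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(repo)

url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text="<md>", images=image, return_tensors="pt")
inputs.pop("height"), inputs.pop("width")  # scaling info, only needed for the <ocr> task
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

with torch.no_grad():
    ids = model.generate(**inputs, max_new_tokens=1024)

markdown = processor.batch_decode(ids, skip_special_tokens=True)[0].replace("<md>", "").strip()
print(markdown)
```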
demo/kosmos-2.5-demo/app.py
ADDED
@@ -0,0 +1,315 @@
import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import re

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Check if Flash Attention 2 is available
def is_flash_attention_available():
    try:
        import flash_attn
        return True
    except ImportError:
        return False

# Initialize models and processors lazily
base_model = None
base_processor = None
chat_model = None
chat_processor = None

def load_base_model():
    global base_model, base_processor
    if base_model is None:
        base_repo = "microsoft/kosmos-2.5"

        # Use Flash Attention 2 if available, otherwise use default attention
        model_kwargs = {
            "device_map": "cuda",
            "dtype": dtype,
        }
        if is_flash_attention_available():
            model_kwargs["attn_implementation"] = "flash_attention_2"

        base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
            base_repo,
            **model_kwargs
        )
        base_processor = AutoProcessor.from_pretrained(base_repo)
    return base_model, base_processor

def load_chat_model():
    global chat_model, chat_processor
    if chat_model is None:
        chat_repo = "microsoft/kosmos-2.5-chat"

        # Use Flash Attention 2 if available, otherwise use default attention
        model_kwargs = {
            "device_map": "cuda",
            "dtype": dtype,
        }
        if is_flash_attention_available():
            model_kwargs["attn_implementation"] = "flash_attention_2"

        chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
            chat_repo,
            **model_kwargs
        )
        chat_processor = AutoProcessor.from_pretrained(chat_repo)
    return chat_model, chat_processor

def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
    y = y.replace(prompt, "")
    if "<md>" in prompt:
        return y

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxs_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]
    bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
    bboxs = [[int(j) for j in i] for i in bboxs]

    info = ""
    for i in range(len(lines)):
        if i < len(bboxs):
            box = bboxs[i]
            x0, y0, x1, y1 = box
            if not (x0 >= x1 or y0 >= y1):
                x0 = int(x0 * scale_width)
                y0 = int(y0 * scale_height)
                x1 = int(x1 * scale_width)
                y1 = int(y1 * scale_height)
                info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}\n"
    return info.strip()

@spaces.GPU(duration=120)
def generate_markdown(image):
    if image is None:
        return "Please upload an image."

    model, processor = load_base_model()

    prompt = "<md>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    result = generated_text[0].replace(prompt, "").strip()

    return result

@spaces.GPU(duration=120)
def generate_ocr(image):
    if image is None:
        return "Please upload an image.", None

    model, processor = load_base_model()

    prompt = "<ocr>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Post-process OCR output
    output_text = post_process_ocr(generated_text[0], scale_height, scale_width)

    # Create visualization
    from PIL import ImageDraw
    vis_image = image.copy()
    draw = ImageDraw.Draw(vis_image)

    lines = output_text.split("\n")
    for line in lines:
        if not line.strip():
            continue
        parts = line.split(",")
        if len(parts) >= 8:
            try:
                coords = list(map(int, parts[:8]))
                draw.polygon(coords, outline="red", width=2)
            except:
                continue

    return output_text, vis_image

@spaces.GPU(duration=120)
def generate_chat_response(image, question):
    if image is None:
        return "Please upload an image."
    if not question.strip():
        return "Please ask a question."

    model, processor = load_chat_model()

    template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
    prompt = template.format(question)

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Extract only the assistant's response
    result = generated_text[0]
    if "ASSISTANT:" in result:
        result = result.split("ASSISTANT:")[-1].strip()

    return result

# Create Gradio interface
with gr.Blocks(title="KOSMOS-2.5 Document AI Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# KOSMOS-2.5 Document AI Demo

Explore Microsoft's KOSMOS-2.5, a multimodal model for reading text-intensive images!
This demo showcases three capabilities:

1. **Markdown Generation**: Convert document images to markdown format
2. **OCR with Bounding Boxes**: Extract text with spatial coordinates
3. **Document Q&A**: Ask questions about document content using KOSMOS-2.5 Chat

Upload a document image (receipt, form, article, etc.) and try different tasks!
""")

    with gr.Tabs():
        # Markdown Generation Tab
        with gr.TabItem("📝 Markdown Generation"):
            with gr.Row():
                with gr.Column():
                    md_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=md_image
                    )
                    md_button = gr.Button("Generate Markdown", variant="primary")
                with gr.Column():
                    md_output = gr.Textbox(
                        label="Generated Markdown",
                        lines=15,
                        max_lines=20,
                        show_copy_button=True
                    )

        # OCR Tab
        with gr.TabItem("🔍 OCR with Bounding Boxes"):
            with gr.Row():
                with gr.Column():
                    ocr_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=ocr_image
                    )
                    ocr_button = gr.Button("Extract Text with Coordinates", variant="primary")
                with gr.Column():
                    with gr.Row():
                        ocr_text = gr.Textbox(
                            label="Extracted Text with Coordinates",
                            lines=10,
                            show_copy_button=True
                        )
                    ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")

        # Chat Tab
        with gr.TabItem("💬 Document Q&A (Chat)"):
            with gr.Row():
                with gr.Column():
                    chat_image = gr.Image(type="pil", label="Upload Document Image")
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=chat_image
                    )
                    chat_question = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="e.g., What is the total amount on this receipt?",
                        lines=2
                    )
                    gr.Examples(
                        examples=["What is the total amount on this receipt?", "What items were purchased?", "When was this receipt issued?", "What is the subtotal?"],
                        inputs=chat_question
                    )
                    chat_button = gr.Button("Get Answer", variant="primary")
                with gr.Column():
                    chat_output = gr.Textbox(
                        label="Answer",
                        lines=8,
                        show_copy_button=True
                    )

    # Event handlers
    md_button.click(
        fn=generate_markdown,
        inputs=[md_image],
        outputs=[md_output]
    )

    ocr_button.click(
        fn=generate_ocr,
        inputs=[ocr_image],
        outputs=[ocr_text, ocr_vis]
    )

    chat_button.click(
        fn=generate_chat_response,
        inputs=[chat_image, chat_question],
        outputs=[chat_output]
    )

    # Examples section
    gr.Markdown("""
## Example Use Cases:
- **Receipts**: Extract itemized information or ask about totals
- **Forms**: Convert to structured format or answer specific questions
- **Articles**: Get markdown format or ask about content
- **Screenshots**: Extract text or get information about specific elements

## Note:
This is a generative model and may occasionally hallucinate. Results should be verified for accuracy.
""")

if __name__ == "__main__":
    demo.launch()
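Each line that post_process_ocr emits above is eight integers (a four-point quad in original image pixels, written as x0,y0,x1,y0,x1,y1,x0,y1) followed by the recognized text, which may itself contain commas. A small sketch of turning those lines back into axis-aligned boxes for downstream use; the helper name and the sample line are illustrative and not part of app.py:

```python
def parse_ocr_lines(output_text: str):
    """Yield ((x0, y0, x1, y1), text) tuples from post_process_ocr output."""
    for line in output_text.splitlines():
        if not line.strip():
            continue
        parts = line.split(",")
        if len(parts) < 9:
            continue
        xs = list(map(int, parts[0:8:2]))
        ys = list(map(int, parts[1:8:2]))
        text = ",".join(parts[8:])  # re-join in case the text itself contained commas
        yield (min(xs), min(ys), max(xs), max(ys)), text

# Example with a made-up output line:
for box, text in parse_ocr_lines("10,20,110,20,110,40,10,40,TOTAL $12.50"):
    print(box, text)  # (10, 20, 110, 40) TOTAL $12.50
```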
demo/kosmos-2.5-demo/requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio==4.44.0
torch>=2.0.0
git+https://github.com/huggingface/transformers.git
accelerate
pillow
requests
spaces
demo/kosmos-2.5-demo/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/spaces/nielsr/kosmos-2.5-demo
models/kosmos-2.5-4bit-text/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-4bit-text/README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->



## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]


#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary



## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
models/kosmos-2.5-4bit-text/config.json
ADDED
@@ -0,0 +1,26 @@
{
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "Kosmos2_5TextForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_heads": 16,
  "bos_token_id": 0,
  "dropout": 0,
  "embed_dim": 1536,
  "eos_token_id": 2,
  "ffn_dim": 6144,
  "init_std": 0.02,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.0,
  "layers": 24,
  "max_position_embeddings": 4096,
  "model_type": "kosmos_2_5_text_model",
  "pad_token_id": 1,
  "scale_embedding": true,
  "torch_dtype": "float16",
  "transformers_version": "4.47.0.dev0",
  "use_cache": true,
  "vocab_size": 108481
}
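The config above fixes the shape of the extracted text decoder, so a rough parameter count can be estimated and compared with the checkpoint that follows. A back-of-the-envelope sketch; it assumes a standard decoder layout (four attention projections and a two-matrix feed-forward block per layer, embeddings shared with the output head, biases and layer norms ignored), which is an assumption rather than something stated in the config:

```python
embed_dim, ffn_dim, layers, vocab = 1536, 6144, 24, 108481

embedding = vocab * embed_dim            # ~167M shared input/output embeddings
attn = 4 * embed_dim * embed_dim         # q, k, v, out projections per layer
ffn = 2 * embed_dim * ffn_dim            # up and down projections per layer
total = embedding + layers * (attn + ffn)

print(f"~{total / 1e6:.0f}M parameters")  # ~846M
# If the 4-bit scheme quantizes only the linear layers and keeps the embedding
# table in 16-bit (an assumption), the expected file size is roughly
# 167M * 2 bytes + 679M * 0.5 bytes ≈ 0.67 GB, in the ballpark of the
# 717 MB model.safetensors below.
```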
models/kosmos-2.5-4bit-text/generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.47.0.dev0"
}
models/kosmos-2.5-4bit-text/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:487cc16ca412b5b3b4ed471a4f4b1f4a203cceaacb40a071bc8bbae794c38c84
size 717140144
models/kosmos-2.5-4bit-text/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/Fireblossom/kosmos-2.5-4bit-text
models/kosmos-2.5-4bit-vision/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-4bit-vision/README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->



## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]


#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary



## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
models/kosmos-2.5-4bit-vision/config.json
ADDED
@@ -0,0 +1,22 @@
{
  "architectures": [
    "Kosmos2_5VisionModel"
  ],
  "attention_dropout": 0.0,
  "d_ff": 3968,
  "d_kv": 64,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.0,
  "hidden_size": 1536,
  "initializer_factor": 1.0,
  "initializer_range": 1e-10,
  "layer_norm_eps": 1e-06,
  "max_length": 4096,
  "model_type": "kosmos_2_5_vision_model",
  "num_attention_heads": 24,
  "num_hidden_layers": 18,
  "patch_embed_hidden_size": 768,
  "seq_len": 4096,
  "torch_dtype": "float16",
  "transformers_version": "4.47.0.dev0"
}
models/kosmos-2.5-4bit-vision/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ab95bc9ba21aa3f673e447a4bb6ffbb694c38baf442d05a93bef748f11d8d1c7
size 306710574
models/kosmos-2.5-4bit-vision/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/Fireblossom/kosmos-2.5-4bit-vision
models/kosmos-2.5-chat/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
models/kosmos-2.5-chat/README.md
ADDED
@@ -0,0 +1,87 @@
---
language: en
license: mit
library_name: transformers
pipeline_tag: image-text-to-text
---
# Kosmos-2.5-chat

[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)

## Model description
Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures in markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted to any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.

Kosmos-2.5-chat is a variant trained specifically for Visual Question Answering (VQA) tasks by further training Kosmos-2.5. For more details about Kosmos-2.5-chat, please refer to the paper:

[Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)

## Usage

KOSMOS-2.5 is supported from Transformers >= 4.56. Find the docs [here](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5).

```python
import re
import torch
import requests
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5-chat"
device = "cuda:0"
dtype = torch.bfloat16

model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo,
                                                          device_map=device,
                                                          torch_dtype=dtype,
                                                          attn_implementation="flash_attention_2")
processor = AutoProcessor.from_pretrained(repo)

# sample image
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"

image = Image.open(requests.get(url, stream=True).raw)

question = "What is the sub total of the receipt?"
template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
prompt = template.format(question)
inputs = processor(text=prompt, images=image, return_tensors="pt")

height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
```

## NOTE
Since this is a generative model, there is a risk of **hallucination** during generation, and the accuracy of results extracted from images **cannot** be guaranteed.

## Inference
**Document Understanding Task:** For usage instructions, please refer to [chat.py](chat.py).

## Citation

If you find Kosmos-2.5-chat useful in your research, please cite the following paper:

```
@article{lv2023kosmos,
  title={Kosmos-2.5: A multimodal literate model},
  author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
  journal={arXiv preprint arXiv:2309.11419},
  year={2023}
}
```

## License
The content of this project is licensed under the [MIT License](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE).

[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
models/kosmos-2.5-chat/chat.py
ADDED
@@ -0,0 +1,40 @@
import re
import torch
import requests
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5-chat"
device = "cuda:0"
dtype = torch.bfloat16

model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo,
                                                          device_map=device,
                                                          torch_dtype=dtype,
                                                          attn_implementation="flash_attention_2")
processor = AutoProcessor.from_pretrained(repo)

# sample image
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"

image = Image.open(requests.get(url, stream=True).raw)

question = "What is the sub total of the receipt?"
template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
prompt = template.format(question)
inputs = processor(text=prompt, images=image, return_tensors="pt")

height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])
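
The `height`/`width` values popped from the processor output (and the resulting `scale_width`/`scale_height`) are only needed when the model is asked for spatially-aware text blocks rather than a chat answer. Below is a minimal sketch of switching the same pipeline to that task; it assumes the base `microsoft/kosmos-2.5` checkpoint is loaded in place of the chat model and that `<ocr>` is its task prompt (both assumptions follow the base model card, not this chat repo).

```python
# Sketch: the text-block (OCR) task instead of chat.
# Assumptions: `model`/`processor` point at the base "microsoft/kosmos-2.5"
# checkpoint and "<ocr>" is its task prompt; coordinates in the output refer
# to the resized image and are mapped back with the scale factors below.
ocr_inputs = processor(text="<ocr>", images=image, return_tensors="pt")
height, width = ocr_inputs.pop("height"), ocr_inputs.pop("width")
scale_width, scale_height = image.size[0] / width, image.size[1] / height

ocr_inputs = {k: v.to(device) if v is not None else None for k, v in ocr_inputs.items()}
ocr_inputs["flattened_patches"] = ocr_inputs["flattened_patches"].to(dtype)
ocr_ids = model.generate(**ocr_inputs, max_new_tokens=1024)
decoded = processor.batch_decode(ocr_ids)[0]  # keep coordinate tokens in the string
# Any x / y coordinates parsed out of `decoded` would then be multiplied by
# scale_width / scale_height to land in the original image's pixel space.
print(decoded)
```
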
models/kosmos-2.5-chat/config.json
ADDED
@@ -0,0 +1,163 @@
{
  "architectures": [
    "Kosmos2_5ForConditionalGeneration"
  ],
  "latent_query_num": 2048,
  "model_type": "kosmos-2.5",
  "text_config": {
    "_name_or_path": "",
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "attention_heads": 16,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.1,
    "early_stopping": false,
    "embed_dim": 1536,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "ffn_dim": 6144,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "init_std": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "layerdrop": 0.0,
    "length_penalty": 1.0,
    "layers": 24,
    "max_length": 20,
    "max_position_embeddings": 4096,
    "min_length": 0,
    "model_type": "kosmos_2_5_text_model",
    "no_repeat_ngram_size": 3,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "scale_embedding": true,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "vocab_size": 108481
  },
  "torch_dtype": "float32",
  "transformers_version": "4.43.3",
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "intermediate_size": 3968,
    "head_dim": 64,
    "decoder_start_token_id": null,
    "dense_act_fn": "gelu_new",
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout_rate": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_size": 1536,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 1e-10,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-06,
    "length_penalty": 1.0,
    "max_length": 4096,
    "min_length": 0,
    "model_type": "kosmos_2_5_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 24,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 18,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_embed_hidden_size": 768,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "max_num_patches": 4096,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false
  }
}
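
For a quick sanity check of the architecture fields above (a 24-layer text decoder with `embed_dim` 1536 and a 108,481-token vocabulary; an 18-layer vision encoder with `hidden_size` 1536), the config can be loaded on its own without downloading any weights. A small sketch, assuming a Transformers version that ships the kosmos-2.5 classes:

```python
from transformers import AutoConfig

# Loads only config.json from the Hub; no weights are downloaded.
config = AutoConfig.from_pretrained("microsoft/kosmos-2.5-chat")
print(config.model_type)                        # kosmos-2.5
print(config.text_config.layers,                # 24
      config.text_config.embed_dim,             # 1536
      config.text_config.vocab_size)            # 108481
print(config.vision_config.num_hidden_layers,   # 18
      config.vision_config.hidden_size)         # 1536
```
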
models/kosmos-2.5-chat/generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "pad_token_id": 1,
  "transformers_version": "4.43.3"
}
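
These are the defaults that `model.generate()` falls back to when a value is not passed explicitly; any of them can be overridden per call with the standard `GenerationConfig` keyword arguments. An illustrative one-liner (the override value is arbitrary, not a recommendation from this repo):

```python
# Override the stored no_repeat_ngram_size=3 for a single generate() call.
generated_ids = model.generate(**inputs, max_new_tokens=1024, no_repeat_ngram_size=0)
```
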
models/kosmos-2.5-chat/model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea32f178375c21412ee2829b2389682544ecd6f990a6120b219e065b1500d085
size 4995252144
models/kosmos-2.5-chat/model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b642fe2ce0ad3ff30838a3daebb6d26252770c65d29306a809e05598fc0e393
size 503408384
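
The two LFS pointers above are the sharded checkpoint; the `model.safetensors.index.json` file that follows maps every parameter name to the shard holding it, which is how `from_pretrained` reassembles the full model. A minimal inspection sketch, assuming the repo has been downloaded to a hypothetical `local_dir` (for example via `huggingface_hub.snapshot_download`):

```python
import json
from safetensors import safe_open

local_dir = "./kosmos-2.5-chat"  # assumed local copy of this repo

# Look up which shard stores a given parameter, then read just that tensor.
with open(f"{local_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "text_model.model.embed_tokens.weight"
shard = index["weight_map"][name]  # e.g. "model-00001-of-00002.safetensors"
with safe_open(f"{local_dir}/{shard}", framework="pt") as fp:
    print(name, tuple(fp.get_tensor(name).shape))
```
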
models/kosmos-2.5-chat/model.safetensors.index.json
ADDED
@@ -0,0 +1,621 @@
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 5498585088
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"image_to_text_projection.dense.bias": "model-00002-of-00002.safetensors",
|
| 7 |
+
"image_to_text_projection.dense.weight": "model-00002-of-00002.safetensors",
|
| 8 |
+
"image_to_text_projection.latent_query": "model-00002-of-00002.safetensors",
|
| 9 |
+
"image_to_text_projection.x_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 10 |
+
"image_to_text_projection.x_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 11 |
+
"image_to_text_projection.x_attn.out_proj.bias": "model-00002-of-00002.safetensors",
|
| 12 |
+
"image_to_text_projection.x_attn.out_proj.weight": "model-00002-of-00002.safetensors",
|
| 13 |
+
"image_to_text_projection.x_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 14 |
+
"image_to_text_projection.x_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 15 |
+
"image_to_text_projection.x_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 16 |
+
"image_to_text_projection.x_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 17 |
+
"text_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"text_model.model.layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 19 |
+
"text_model.model.layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"text_model.model.layers.0.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 21 |
+
"text_model.model.layers.0.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"text_model.model.layers.0.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 23 |
+
"text_model.model.layers.0.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"text_model.model.layers.0.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 25 |
+
"text_model.model.layers.0.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"text_model.model.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 27 |
+
"text_model.model.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"text_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 29 |
+
"text_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"text_model.model.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 31 |
+
"text_model.model.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"text_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 33 |
+
"text_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"text_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 35 |
+
"text_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"text_model.model.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 37 |
+
"text_model.model.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"text_model.model.layers.1.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 39 |
+
"text_model.model.layers.1.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"text_model.model.layers.1.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 41 |
+
"text_model.model.layers.1.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"text_model.model.layers.1.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 43 |
+
"text_model.model.layers.1.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"text_model.model.layers.1.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 45 |
+
"text_model.model.layers.1.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"text_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 47 |
+
"text_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"text_model.model.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 49 |
+
"text_model.model.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"text_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 51 |
+
"text_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"text_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 53 |
+
"text_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"text_model.model.layers.1.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 55 |
+
"text_model.model.layers.1.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"text_model.model.layers.10.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 57 |
+
"text_model.model.layers.10.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"text_model.model.layers.10.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 59 |
+
"text_model.model.layers.10.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"text_model.model.layers.10.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 61 |
+
"text_model.model.layers.10.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"text_model.model.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 63 |
+
"text_model.model.layers.10.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"text_model.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 65 |
+
"text_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"text_model.model.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 67 |
+
"text_model.model.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"text_model.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 69 |
+
"text_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"text_model.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 71 |
+
"text_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"text_model.model.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 73 |
+
"text_model.model.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"text_model.model.layers.11.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 75 |
+
"text_model.model.layers.11.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"text_model.model.layers.11.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 77 |
+
"text_model.model.layers.11.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"text_model.model.layers.11.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 79 |
+
"text_model.model.layers.11.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"text_model.model.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 81 |
+
"text_model.model.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"text_model.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 83 |
+
"text_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"text_model.model.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 85 |
+
"text_model.model.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"text_model.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 87 |
+
"text_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"text_model.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 89 |
+
"text_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"text_model.model.layers.11.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 91 |
+
"text_model.model.layers.11.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"text_model.model.layers.12.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 93 |
+
"text_model.model.layers.12.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"text_model.model.layers.12.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 95 |
+
"text_model.model.layers.12.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"text_model.model.layers.12.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 97 |
+
"text_model.model.layers.12.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"text_model.model.layers.12.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 99 |
+
"text_model.model.layers.12.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"text_model.model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 101 |
+
"text_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 102 |
+
"text_model.model.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 103 |
+
"text_model.model.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 104 |
+
"text_model.model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 105 |
+
"text_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"text_model.model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 107 |
+
"text_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"text_model.model.layers.12.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 109 |
+
"text_model.model.layers.12.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 110 |
+
"text_model.model.layers.13.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 111 |
+
"text_model.model.layers.13.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"text_model.model.layers.13.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 113 |
+
"text_model.model.layers.13.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 114 |
+
"text_model.model.layers.13.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 115 |
+
"text_model.model.layers.13.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 116 |
+
"text_model.model.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 117 |
+
"text_model.model.layers.13.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"text_model.model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 119 |
+
"text_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"text_model.model.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 121 |
+
"text_model.model.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 122 |
+
"text_model.model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 123 |
+
"text_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"text_model.model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 125 |
+
"text_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"text_model.model.layers.13.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 127 |
+
"text_model.model.layers.13.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"text_model.model.layers.14.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 129 |
+
"text_model.model.layers.14.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"text_model.model.layers.14.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 131 |
+
"text_model.model.layers.14.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"text_model.model.layers.14.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 133 |
+
"text_model.model.layers.14.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"text_model.model.layers.14.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 135 |
+
"text_model.model.layers.14.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"text_model.model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 137 |
+
"text_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"text_model.model.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 139 |
+
"text_model.model.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"text_model.model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 141 |
+
"text_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"text_model.model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 143 |
+
"text_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"text_model.model.layers.14.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 145 |
+
"text_model.model.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"text_model.model.layers.15.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 147 |
+
"text_model.model.layers.15.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"text_model.model.layers.15.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 149 |
+
"text_model.model.layers.15.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"text_model.model.layers.15.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 151 |
+
"text_model.model.layers.15.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 152 |
+
"text_model.model.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 153 |
+
"text_model.model.layers.15.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"text_model.model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 155 |
+
"text_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"text_model.model.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 157 |
+
"text_model.model.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"text_model.model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 159 |
+
"text_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"text_model.model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 161 |
+
"text_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"text_model.model.layers.15.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 163 |
+
"text_model.model.layers.15.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 164 |
+
"text_model.model.layers.16.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 165 |
+
"text_model.model.layers.16.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 166 |
+
"text_model.model.layers.16.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 167 |
+
"text_model.model.layers.16.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 168 |
+
"text_model.model.layers.16.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 169 |
+
"text_model.model.layers.16.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 170 |
+
"text_model.model.layers.16.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 171 |
+
"text_model.model.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 172 |
+
"text_model.model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 173 |
+
"text_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 174 |
+
"text_model.model.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 175 |
+
"text_model.model.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 176 |
+
"text_model.model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 177 |
+
"text_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 178 |
+
"text_model.model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 179 |
+
"text_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 180 |
+
"text_model.model.layers.16.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 181 |
+
"text_model.model.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 182 |
+
"text_model.model.layers.17.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 183 |
+
"text_model.model.layers.17.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 184 |
+
"text_model.model.layers.17.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 185 |
+
"text_model.model.layers.17.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 186 |
+
"text_model.model.layers.17.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 187 |
+
"text_model.model.layers.17.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 188 |
+
"text_model.model.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 189 |
+
"text_model.model.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 190 |
+
"text_model.model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 191 |
+
"text_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 192 |
+
"text_model.model.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 193 |
+
"text_model.model.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 194 |
+
"text_model.model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 195 |
+
"text_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 196 |
+
"text_model.model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 197 |
+
"text_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 198 |
+
"text_model.model.layers.17.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 199 |
+
"text_model.model.layers.17.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 200 |
+
"text_model.model.layers.18.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 201 |
+
"text_model.model.layers.18.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 202 |
+
"text_model.model.layers.18.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 203 |
+
"text_model.model.layers.18.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 204 |
+
"text_model.model.layers.18.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 205 |
+
"text_model.model.layers.18.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 206 |
+
"text_model.model.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 207 |
+
"text_model.model.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 208 |
+
"text_model.model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 209 |
+
"text_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 210 |
+
"text_model.model.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 211 |
+
"text_model.model.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 212 |
+
"text_model.model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 213 |
+
"text_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 214 |
+
"text_model.model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 215 |
+
"text_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 216 |
+
"text_model.model.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 217 |
+
"text_model.model.layers.18.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 218 |
+
"text_model.model.layers.19.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 219 |
+
"text_model.model.layers.19.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 220 |
+
"text_model.model.layers.19.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 221 |
+
"text_model.model.layers.19.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 222 |
+
"text_model.model.layers.19.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 223 |
+
"text_model.model.layers.19.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 224 |
+
"text_model.model.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 225 |
+
"text_model.model.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 226 |
+
"text_model.model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 227 |
+
"text_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 228 |
+
"text_model.model.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 229 |
+
"text_model.model.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 230 |
+
"text_model.model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 231 |
+
"text_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 232 |
+
"text_model.model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 233 |
+
"text_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 234 |
+
"text_model.model.layers.19.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 235 |
+
"text_model.model.layers.19.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 236 |
+
"text_model.model.layers.2.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 237 |
+
"text_model.model.layers.2.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 238 |
+
"text_model.model.layers.2.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 239 |
+
"text_model.model.layers.2.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 240 |
+
"text_model.model.layers.2.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 241 |
+
"text_model.model.layers.2.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 242 |
+
"text_model.model.layers.2.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 243 |
+
"text_model.model.layers.2.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 244 |
+
"text_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 245 |
+
"text_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 246 |
+
"text_model.model.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 247 |
+
"text_model.model.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 248 |
+
"text_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 249 |
+
"text_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 250 |
+
"text_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 251 |
+
"text_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 252 |
+
"text_model.model.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 253 |
+
"text_model.model.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 254 |
+
"text_model.model.layers.20.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 255 |
+
"text_model.model.layers.20.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 256 |
+
"text_model.model.layers.20.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 257 |
+
"text_model.model.layers.20.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 258 |
+
"text_model.model.layers.20.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 259 |
+
"text_model.model.layers.20.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 260 |
+
"text_model.model.layers.20.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 261 |
+
"text_model.model.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 262 |
+
"text_model.model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 263 |
+
"text_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 264 |
+
"text_model.model.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 265 |
+
"text_model.model.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 266 |
+
"text_model.model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 267 |
+
"text_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 268 |
+
"text_model.model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 269 |
+
"text_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 270 |
+
"text_model.model.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 271 |
+
"text_model.model.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 272 |
+
"text_model.model.layers.21.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 273 |
+
"text_model.model.layers.21.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 274 |
+
"text_model.model.layers.21.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 275 |
+
"text_model.model.layers.21.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 276 |
+
"text_model.model.layers.21.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 277 |
+
"text_model.model.layers.21.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 278 |
+
"text_model.model.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 279 |
+
"text_model.model.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 280 |
+
"text_model.model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 281 |
+
"text_model.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 282 |
+
"text_model.model.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 283 |
+
"text_model.model.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 284 |
+
"text_model.model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 285 |
+
"text_model.model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 286 |
+
"text_model.model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 287 |
+
"text_model.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 288 |
+
"text_model.model.layers.21.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 289 |
+
"text_model.model.layers.21.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 290 |
+
"text_model.model.layers.22.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 291 |
+
"text_model.model.layers.22.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 292 |
+
"text_model.model.layers.22.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 293 |
+
"text_model.model.layers.22.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 294 |
+
"text_model.model.layers.22.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 295 |
+
"text_model.model.layers.22.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 296 |
+
"text_model.model.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 297 |
+
"text_model.model.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 298 |
+
"text_model.model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 299 |
+
"text_model.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 300 |
+
"text_model.model.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 301 |
+
"text_model.model.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 302 |
+
"text_model.model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 303 |
+
"text_model.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 304 |
+
"text_model.model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 305 |
+
"text_model.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 306 |
+
"text_model.model.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 307 |
+
"text_model.model.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 308 |
+
"text_model.model.layers.23.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 309 |
+
"text_model.model.layers.23.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 310 |
+
"text_model.model.layers.23.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 311 |
+
"text_model.model.layers.23.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 312 |
+
"text_model.model.layers.23.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 313 |
+
"text_model.model.layers.23.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 314 |
+
"text_model.model.layers.23.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 315 |
+
"text_model.model.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 316 |
+
"text_model.model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 317 |
+
"text_model.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 318 |
+
"text_model.model.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 319 |
+
"text_model.model.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 320 |
+
"text_model.model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 321 |
+
"text_model.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 322 |
+
"text_model.model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 323 |
+
"text_model.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 324 |
+
"text_model.model.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 325 |
+
"text_model.model.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 326 |
+
"text_model.model.layers.3.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 327 |
+
"text_model.model.layers.3.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 328 |
+
"text_model.model.layers.3.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 329 |
+
"text_model.model.layers.3.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 330 |
+
"text_model.model.layers.3.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 331 |
+
"text_model.model.layers.3.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 332 |
+
"text_model.model.layers.3.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 333 |
+
"text_model.model.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 334 |
+
"text_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 335 |
+
"text_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 336 |
+
"text_model.model.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 337 |
+
"text_model.model.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 338 |
+
"text_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 339 |
+
"text_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 340 |
+
"text_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 341 |
+
"text_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 342 |
+
"text_model.model.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 343 |
+
"text_model.model.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 344 |
+
"text_model.model.layers.4.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 345 |
+
"text_model.model.layers.4.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 346 |
+
"text_model.model.layers.4.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 347 |
+
"text_model.model.layers.4.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 348 |
+
"text_model.model.layers.4.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 349 |
+
"text_model.model.layers.4.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 350 |
+
"text_model.model.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 351 |
+
"text_model.model.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 352 |
+
"text_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 353 |
+
"text_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 354 |
+
"text_model.model.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 355 |
+
"text_model.model.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 356 |
+
"text_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 357 |
+
"text_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 358 |
+
"text_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 359 |
+
"text_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 360 |
+
"text_model.model.layers.4.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 361 |
+
"text_model.model.layers.4.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 362 |
+
"text_model.model.layers.5.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 363 |
+
"text_model.model.layers.5.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 364 |
+
"text_model.model.layers.5.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 365 |
+
"text_model.model.layers.5.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 366 |
+
"text_model.model.layers.5.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 367 |
+
"text_model.model.layers.5.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 368 |
+
"text_model.model.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 369 |
+
"text_model.model.layers.5.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 370 |
+
"text_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 371 |
+
"text_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 372 |
+
"text_model.model.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 373 |
+
"text_model.model.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 374 |
+
"text_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 375 |
+
"text_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 376 |
+
"text_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 377 |
+
"text_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 378 |
+
"text_model.model.layers.5.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 379 |
+
"text_model.model.layers.5.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 380 |
+
"text_model.model.layers.6.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 381 |
+
"text_model.model.layers.6.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 382 |
+
"text_model.model.layers.6.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 383 |
+
"text_model.model.layers.6.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 384 |
+
"text_model.model.layers.6.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 385 |
+
"text_model.model.layers.6.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 386 |
+
"text_model.model.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 387 |
+
"text_model.model.layers.6.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 388 |
+
"text_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 389 |
+
"text_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 390 |
+
"text_model.model.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 391 |
+
"text_model.model.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 392 |
+
"text_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 393 |
+
"text_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 394 |
+
"text_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 395 |
+
"text_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 396 |
+
"text_model.model.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 397 |
+
"text_model.model.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 398 |
+
"text_model.model.layers.7.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 399 |
+
"text_model.model.layers.7.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 400 |
+
"text_model.model.layers.7.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 401 |
+
"text_model.model.layers.7.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 402 |
+
"text_model.model.layers.7.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 403 |
+
"text_model.model.layers.7.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 404 |
+
"text_model.model.layers.7.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 405 |
+
"text_model.model.layers.7.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 406 |
+
"text_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 407 |
+
"text_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 408 |
+
"text_model.model.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 409 |
+
"text_model.model.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 410 |
+
"text_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 411 |
+
"text_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 412 |
+
"text_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 413 |
+
"text_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 414 |
+
"text_model.model.layers.7.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 415 |
+
"text_model.model.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 416 |
+
"text_model.model.layers.8.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 417 |
+
"text_model.model.layers.8.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 418 |
+
"text_model.model.layers.8.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 419 |
+
"text_model.model.layers.8.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 420 |
+
"text_model.model.layers.8.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 421 |
+
"text_model.model.layers.8.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 422 |
+
"text_model.model.layers.8.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 423 |
+
"text_model.model.layers.8.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 424 |
+
"text_model.model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 425 |
+
"text_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 426 |
+
"text_model.model.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 427 |
+
"text_model.model.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 428 |
+
"text_model.model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 429 |
+
"text_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 430 |
+
"text_model.model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 431 |
+
"text_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 432 |
+
"text_model.model.layers.8.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 433 |
+
"text_model.model.layers.8.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 434 |
+
"text_model.model.layers.9.ffn.fc1.bias": "model-00001-of-00002.safetensors",
|
| 435 |
+
"text_model.model.layers.9.ffn.fc1.weight": "model-00001-of-00002.safetensors",
|
| 436 |
+
"text_model.model.layers.9.ffn.fc2.bias": "model-00001-of-00002.safetensors",
|
| 437 |
+
"text_model.model.layers.9.ffn.fc2.weight": "model-00001-of-00002.safetensors",
|
| 438 |
+
"text_model.model.layers.9.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 439 |
+
"text_model.model.layers.9.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 440 |
+
"text_model.model.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 441 |
+
"text_model.model.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 442 |
+
"text_model.model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 443 |
+
"text_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 444 |
+
"text_model.model.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 445 |
+
"text_model.model.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 446 |
+
"text_model.model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 447 |
+
"text_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 448 |
+
"text_model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 449 |
+
"text_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 450 |
+
"text_model.model.layers.9.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
|
| 451 |
+
"text_model.model.layers.9.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 452 |
+
"text_model.model.segment_emb.weight": "model-00001-of-00002.safetensors",
|
| 453 |
+
"vision_model.embeddings.column_embedder.weight": "model-00001-of-00002.safetensors",
|
| 454 |
+
"vision_model.embeddings.patch_projection.bias": "model-00001-of-00002.safetensors",
|
| 455 |
+
"vision_model.embeddings.patch_projection.weight": "model-00001-of-00002.safetensors",
|
| 456 |
+
"vision_model.embeddings.row_embedder.weight": "model-00001-of-00002.safetensors",
|
| 457 |
+
"vision_model.encoder.layer.0.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 458 |
+
"vision_model.encoder.layer.0.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 459 |
+
"vision_model.encoder.layer.0.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 460 |
+
"vision_model.encoder.layer.0.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 461 |
+
"vision_model.encoder.layer.0.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 462 |
+
"vision_model.encoder.layer.0.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 463 |
+
"vision_model.encoder.layer.0.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 464 |
+
"vision_model.encoder.layer.0.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 465 |
+
"vision_model.encoder.layer.0.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 466 |
+
"vision_model.encoder.layer.1.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 467 |
+
"vision_model.encoder.layer.1.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 468 |
+
"vision_model.encoder.layer.1.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 469 |
+
"vision_model.encoder.layer.1.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 470 |
+
"vision_model.encoder.layer.1.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 471 |
+
"vision_model.encoder.layer.1.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 472 |
+
"vision_model.encoder.layer.1.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 473 |
+
"vision_model.encoder.layer.1.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 474 |
+
"vision_model.encoder.layer.1.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 475 |
+
"vision_model.encoder.layer.10.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 476 |
+
"vision_model.encoder.layer.10.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 477 |
+
"vision_model.encoder.layer.10.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 478 |
+
"vision_model.encoder.layer.10.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 479 |
+
"vision_model.encoder.layer.10.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 480 |
+
"vision_model.encoder.layer.10.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 481 |
+
"vision_model.encoder.layer.10.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 482 |
+
"vision_model.encoder.layer.10.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 483 |
+
"vision_model.encoder.layer.10.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 484 |
+
"vision_model.encoder.layer.11.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 485 |
+
"vision_model.encoder.layer.11.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 486 |
+
"vision_model.encoder.layer.11.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 487 |
+
"vision_model.encoder.layer.11.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 488 |
+
"vision_model.encoder.layer.11.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 489 |
+
"vision_model.encoder.layer.11.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 490 |
+
"vision_model.encoder.layer.11.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 491 |
+
"vision_model.encoder.layer.11.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 492 |
+
"vision_model.encoder.layer.11.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 493 |
+
"vision_model.encoder.layer.12.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 494 |
+
"vision_model.encoder.layer.12.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 495 |
+
"vision_model.encoder.layer.12.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 496 |
+
"vision_model.encoder.layer.12.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 497 |
+
"vision_model.encoder.layer.12.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 498 |
+
"vision_model.encoder.layer.12.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 499 |
+
"vision_model.encoder.layer.12.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 500 |
+
"vision_model.encoder.layer.12.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 501 |
+
"vision_model.encoder.layer.12.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 502 |
+
"vision_model.encoder.layer.13.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 503 |
+
"vision_model.encoder.layer.13.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 504 |
+
"vision_model.encoder.layer.13.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 505 |
+
"vision_model.encoder.layer.13.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 506 |
+
"vision_model.encoder.layer.13.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 507 |
+
"vision_model.encoder.layer.13.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 508 |
+
"vision_model.encoder.layer.13.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 509 |
+
"vision_model.encoder.layer.13.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 510 |
+
"vision_model.encoder.layer.13.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 511 |
+
"vision_model.encoder.layer.14.attention.key.weight": "model-00002-of-00002.safetensors",
|
| 512 |
+
"vision_model.encoder.layer.14.attention.output.weight": "model-00002-of-00002.safetensors",
|
| 513 |
+
"vision_model.encoder.layer.14.attention.query.weight": "model-00002-of-00002.safetensors",
|
| 514 |
+
"vision_model.encoder.layer.14.attention.value.weight": "model-00002-of-00002.safetensors",
|
| 515 |
+
"vision_model.encoder.layer.14.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
|
| 516 |
+
"vision_model.encoder.layer.14.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
|
| 517 |
+
"vision_model.encoder.layer.14.mlp.wo.weight": "model-00002-of-00002.safetensors",
|
| 518 |
+
"vision_model.encoder.layer.14.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 519 |
+
"vision_model.encoder.layer.14.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 520 |
+
"vision_model.encoder.layer.15.attention.key.weight": "model-00002-of-00002.safetensors",
|
| 521 |
+
"vision_model.encoder.layer.15.attention.output.weight": "model-00002-of-00002.safetensors",
|
| 522 |
+
"vision_model.encoder.layer.15.attention.query.weight": "model-00002-of-00002.safetensors",
|
| 523 |
+
"vision_model.encoder.layer.15.attention.value.weight": "model-00002-of-00002.safetensors",
|
| 524 |
+
"vision_model.encoder.layer.15.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
|
| 525 |
+
"vision_model.encoder.layer.15.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
|
| 526 |
+
"vision_model.encoder.layer.15.mlp.wo.weight": "model-00002-of-00002.safetensors",
|
| 527 |
+
"vision_model.encoder.layer.15.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 528 |
+
"vision_model.encoder.layer.15.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 529 |
+
"vision_model.encoder.layer.16.attention.key.weight": "model-00002-of-00002.safetensors",
|
| 530 |
+
"vision_model.encoder.layer.16.attention.output.weight": "model-00002-of-00002.safetensors",
|
| 531 |
+
"vision_model.encoder.layer.16.attention.query.weight": "model-00002-of-00002.safetensors",
|
| 532 |
+
"vision_model.encoder.layer.16.attention.value.weight": "model-00002-of-00002.safetensors",
|
| 533 |
+
"vision_model.encoder.layer.16.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
|
| 534 |
+
"vision_model.encoder.layer.16.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
|
| 535 |
+
"vision_model.encoder.layer.16.mlp.wo.weight": "model-00002-of-00002.safetensors",
|
| 536 |
+
"vision_model.encoder.layer.16.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 537 |
+
"vision_model.encoder.layer.16.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 538 |
+
"vision_model.encoder.layer.17.attention.key.weight": "model-00002-of-00002.safetensors",
|
| 539 |
+
"vision_model.encoder.layer.17.attention.output.weight": "model-00002-of-00002.safetensors",
|
| 540 |
+
"vision_model.encoder.layer.17.attention.query.weight": "model-00002-of-00002.safetensors",
|
| 541 |
+
"vision_model.encoder.layer.17.attention.value.weight": "model-00002-of-00002.safetensors",
|
| 542 |
+
"vision_model.encoder.layer.17.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
|
| 543 |
+
"vision_model.encoder.layer.17.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
|
| 544 |
+
"vision_model.encoder.layer.17.mlp.wo.weight": "model-00002-of-00002.safetensors",
|
| 545 |
+
"vision_model.encoder.layer.17.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 546 |
+
"vision_model.encoder.layer.17.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
|
| 547 |
+
"vision_model.encoder.layer.2.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 548 |
+
"vision_model.encoder.layer.2.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 549 |
+
"vision_model.encoder.layer.2.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 550 |
+
"vision_model.encoder.layer.2.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 551 |
+
"vision_model.encoder.layer.2.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 552 |
+
"vision_model.encoder.layer.2.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 553 |
+
"vision_model.encoder.layer.2.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 554 |
+
"vision_model.encoder.layer.2.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 555 |
+
"vision_model.encoder.layer.2.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 556 |
+
"vision_model.encoder.layer.3.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 557 |
+
"vision_model.encoder.layer.3.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 558 |
+
"vision_model.encoder.layer.3.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 559 |
+
"vision_model.encoder.layer.3.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 560 |
+
"vision_model.encoder.layer.3.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 561 |
+
"vision_model.encoder.layer.3.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 562 |
+
"vision_model.encoder.layer.3.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 563 |
+
"vision_model.encoder.layer.3.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 564 |
+
"vision_model.encoder.layer.3.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 565 |
+
"vision_model.encoder.layer.4.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 566 |
+
"vision_model.encoder.layer.4.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 567 |
+
"vision_model.encoder.layer.4.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 568 |
+
"vision_model.encoder.layer.4.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 569 |
+
"vision_model.encoder.layer.4.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 570 |
+
"vision_model.encoder.layer.4.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 571 |
+
"vision_model.encoder.layer.4.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 572 |
+
"vision_model.encoder.layer.4.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 573 |
+
"vision_model.encoder.layer.4.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 574 |
+
"vision_model.encoder.layer.5.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 575 |
+
"vision_model.encoder.layer.5.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 576 |
+
"vision_model.encoder.layer.5.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 577 |
+
"vision_model.encoder.layer.5.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 578 |
+
"vision_model.encoder.layer.5.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 579 |
+
"vision_model.encoder.layer.5.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 580 |
+
"vision_model.encoder.layer.5.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 581 |
+
"vision_model.encoder.layer.5.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 582 |
+
"vision_model.encoder.layer.5.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 583 |
+
"vision_model.encoder.layer.6.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 584 |
+
"vision_model.encoder.layer.6.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 585 |
+
"vision_model.encoder.layer.6.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 586 |
+
"vision_model.encoder.layer.6.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 587 |
+
"vision_model.encoder.layer.6.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 588 |
+
"vision_model.encoder.layer.6.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 589 |
+
"vision_model.encoder.layer.6.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 590 |
+
"vision_model.encoder.layer.6.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 591 |
+
"vision_model.encoder.layer.6.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 592 |
+
"vision_model.encoder.layer.7.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 593 |
+
"vision_model.encoder.layer.7.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 594 |
+
"vision_model.encoder.layer.7.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 595 |
+
"vision_model.encoder.layer.7.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 596 |
+
"vision_model.encoder.layer.7.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 597 |
+
"vision_model.encoder.layer.7.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 598 |
+
"vision_model.encoder.layer.7.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 599 |
+
"vision_model.encoder.layer.7.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 600 |
+
"vision_model.encoder.layer.7.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 601 |
+
"vision_model.encoder.layer.8.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 602 |
+
"vision_model.encoder.layer.8.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 603 |
+
"vision_model.encoder.layer.8.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 604 |
+
"vision_model.encoder.layer.8.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 605 |
+
"vision_model.encoder.layer.8.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 606 |
+
"vision_model.encoder.layer.8.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 607 |
+
"vision_model.encoder.layer.8.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 608 |
+
"vision_model.encoder.layer.8.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 609 |
+
"vision_model.encoder.layer.8.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 610 |
+
"vision_model.encoder.layer.9.attention.key.weight": "model-00001-of-00002.safetensors",
|
| 611 |
+
"vision_model.encoder.layer.9.attention.output.weight": "model-00001-of-00002.safetensors",
|
| 612 |
+
"vision_model.encoder.layer.9.attention.query.weight": "model-00001-of-00002.safetensors",
|
| 613 |
+
"vision_model.encoder.layer.9.attention.value.weight": "model-00001-of-00002.safetensors",
|
| 614 |
+
"vision_model.encoder.layer.9.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
|
| 615 |
+
"vision_model.encoder.layer.9.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
|
| 616 |
+
"vision_model.encoder.layer.9.mlp.wo.weight": "model-00001-of-00002.safetensors",
|
| 617 |
+
"vision_model.encoder.layer.9.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 618 |
+
"vision_model.encoder.layer.9.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
|
| 619 |
+
"vision_model.layernorm.weight": "model-00002-of-00002.safetensors"
|
| 620 |
+
}
|
| 621 |
+
}
|
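The index above maps every parameter name to the shard file that stores it. As a quick orientation (a minimal sketch, not part of the original repository; it assumes `model.safetensors.index.json` and both shard files were downloaded into a local `kosmos-2.5-chat/` directory), the shard for any tensor can be resolved and loaded like this:

```python
# Minimal sketch: resolve a tensor to its shard via model.safetensors.index.json.
import json
from safetensors import safe_open

repo_dir = "kosmos-2.5-chat"  # assumed local download location
with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "vision_model.layernorm.weight"      # listed above as living in shard 2
shard = index["weight_map"][name]           # -> "model-00002-of-00002.safetensors"

with safe_open(f"{repo_dir}/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)
print(name, "->", shard, tuple(tensor.shape))
```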
models/kosmos-2.5-chat/preprocessor_config.json
ADDED
|
@@ -0,0 +1,5 @@
|
| 1 |
+
{
|
| 2 |
+
"image_processor_type": "Kosmos2_5ImageProcessor",
|
| 3 |
+
"processor_class": "Kosmos2_5Processor"
|
| 4 |
+
}
|
| 5 |
+
|
models/kosmos-2.5-chat/source.txt
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
https://huggingface.co/microsoft/kosmos-2.5-chat
|
models/kosmos-2.5-chat/special_tokens_map.json
ADDED
|
@@ -0,0 +1,33 @@
|
| 1 |
+
{
|
| 2 |
+
"boi_token": "<image>",
|
| 3 |
+
"bos_token": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false
|
| 9 |
+
},
|
| 10 |
+
"eoi_token": "</image>",
|
| 11 |
+
"eos_token": {
|
| 12 |
+
"content": "</s>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false
|
| 17 |
+
},
|
| 18 |
+
"image_token": "<s>",
|
| 19 |
+
"pad_token": {
|
| 20 |
+
"content": "<pad>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false
|
| 25 |
+
},
|
| 26 |
+
"unk_token": {
|
| 27 |
+
"content": "<unk>",
|
| 28 |
+
"lstrip": false,
|
| 29 |
+
"normalized": false,
|
| 30 |
+
"rstrip": false,
|
| 31 |
+
"single_word": false
|
| 32 |
+
}
|
| 33 |
+
}
|
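To double-check the special tokens above without reading the JSON by hand, a minimal sketch (assuming Transformers >= 4.56 with Kosmos-2.5 support and access to the Hub, so that the chat tokenizer resolves through `AutoTokenizer`):

```python
# Minimal sketch: inspect the special tokens declared by microsoft/kosmos-2.5-chat.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5-chat")
print(tokenizer.special_tokens_map)   # bos/eos/pad/unk as listed above
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)
```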
models/kosmos-2.5-chat/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
models/kosmos-2.5-chat/tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
models/kosmos-2.5-ft/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/kosmos-2.5-ft/README.md
ADDED
|
@@ -0,0 +1,111 @@
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- merve/doclaynet-small
|
| 5 |
+
base_model:
|
| 6 |
+
- microsoft/kosmos-2.5
|
| 7 |
+
pipeline_tag: image-text-to-text
|
| 8 |
+
library_name: transformers
|
| 9 |
+
tags:
|
| 10 |
+
- ocr
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Kosmos-2.5 Fine-tuned on DocLayNet
|
| 14 |
+
|
| 15 |
+
Kosmos-2.5 fine-tuned for grounded OCR (OCR with bounding boxes); the fine-tuning script is available here: ([GH](https://github.com/merveenoyan/smol-vision/blob/main/Grounded_Fine_tuning%20GH.ipynb), [HF](https://huggingface.co/merve/smol-vision/blob/main/Grounded_Fine_tuning.ipynb))
|
| 16 |
+
|
| 17 |
+
Try the demo of the base Kosmos-2.5 model [here](https://huggingface.co/spaces/nielsr/kosmos-2.5-demo).
|
| 18 |
+
|
| 19 |
+
Here's the inference code:
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
|
| 23 |
+
import torch
|
| 24 |
+
|
| 25 |
+
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
|
| 26 |
+
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
|
| 27 |
+
|
| 28 |
+
import requests
|
| 29 |
+
from PIL import Image
|
| 30 |
+
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
|
| 31 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
import re
|
| 35 |
+
prompt = "<ocr>"
|
| 36 |
+
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
| 37 |
+
height, width = inputs.pop("height"), inputs.pop("width")
|
| 38 |
+
raw_width, raw_height = image.size
|
| 39 |
+
scale_height = raw_height / height
|
| 40 |
+
scale_width = raw_width / width
|
| 41 |
+
|
| 42 |
+
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
|
| 43 |
+
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
|
| 44 |
+
|
| 45 |
+
generated_ids = model.generate(
|
| 46 |
+
**inputs,
|
| 47 |
+
max_new_tokens=2000,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 51 |
+
|
| 52 |
+
import re
|
| 53 |
+
from PIL import ImageDraw
|
| 54 |
+
|
| 55 |
+
def post_process(y, scale_height, scale_width):
|
| 56 |
+
|
| 57 |
+
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
|
| 58 |
+
bboxes_raw = re.findall(pattern, y)
|
| 59 |
+
lines = re.split(pattern, y)[1:]
|
| 60 |
+
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
|
| 61 |
+
|
| 62 |
+
out_lines = []
|
| 63 |
+
for i, box in enumerate(bboxes):
|
| 64 |
+
if len(box) != 4:
|
| 65 |
+
continue
|
| 66 |
+
x0, y0, x1, y1 = box
|
| 67 |
+
|
| 68 |
+
if x0 >= x1 or y0 >= y1:
|
| 69 |
+
continue
|
| 70 |
+
|
| 71 |
+
sx0 = int(x0 * scale_width)
|
| 72 |
+
sy0 = int(y0 * scale_height)
|
| 73 |
+
sx1 = int(x1 * scale_width)
|
| 74 |
+
sy1 = int(y1 * scale_height)
|
| 75 |
+
|
| 76 |
+
label = lines[i] if i < len(lines) else ""
|
| 77 |
+
label = label.lstrip(", ").strip()
|
| 78 |
+
|
| 79 |
+
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
|
| 80 |
+
|
| 81 |
+
return "\n".join(out_lines)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
output_text = post_process(generated_text[0], scale_height, scale_width)
|
| 85 |
+
print(output_text)
|
| 86 |
+
|
| 87 |
+
draw = ImageDraw.Draw(image)
|
| 88 |
+
|
| 89 |
+
for line in output_text.strip().splitlines():
|
| 90 |
+
coords = re.findall(r"-?\d+", line)[:8]
|
| 91 |
+
if len(coords) < 8:
|
| 92 |
+
continue
|
| 93 |
+
xy = list(map(int, coords))
|
| 94 |
+
draw.polygon(xy, outline="red")
|
| 95 |
+
|
| 96 |
+
image.save("output.png")
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
The output text (shortened here) and the annotated image:
|
| 100 |
+
```
|
| 101 |
+
338,17,673,17,673,82,338,82,CONFIDENTIAL
|
| 102 |
+
445,68,478,68,478,97,445,97,-2-
|
| 103 |
+
169,129,193,129,193,157,169,157,6.
|
| 104 |
+
334,129,910,129,910,157,334,157,A suggestion that the light could have been produced by
|
| 105 |
+
169,150,900,150,900,177,169,177,a photo-flash from a high-flying aircraft was discounted. No aircraft
|
| 106 |
+
166,171,856,171,856,198,166,198,was heard at the time and, in any case, no known photo-flash has a
|
| 107 |
+
...
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+

|
| 111 |
+
|
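Each line of the fine-tuned model's output above follows the pattern `x0,y0,x1,y0,x1,y1,x0,y1,text`. If structured records are more convenient than raw lines, they can be recovered with a small helper (illustrative only, not part of the original model card; `parse_quads` is a hypothetical name):

```python
# Illustrative helper: turn "x0,y0,x1,y0,x1,y1,x0,y1,text" lines into dicts.
def parse_quads(output_text):
    records = []
    for line in output_text.strip().splitlines():
        parts = line.split(",", 8)   # first 8 fields are coordinates, the rest is the text
        if len(parts) < 9:
            continue
        coords = list(map(int, parts[:8]))
        records.append({
            "quad": [(coords[i], coords[i + 1]) for i in range(0, 8, 2)],
            "text": parts[8],
        })
    return records

# parse_quads(output_text)[0]
# -> {"quad": [(338, 17), (673, 17), (673, 82), (338, 82)], "text": "CONFIDENTIAL"}
```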
models/kosmos-2.5-ft/config.json
ADDED
|
@@ -0,0 +1,46 @@
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Kosmos2_5ForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"dtype": "bfloat16",
|
| 6 |
+
"latent_query_num": 2048,
|
| 7 |
+
"model_type": "kosmos-2.5",
|
| 8 |
+
"text_config": {
|
| 9 |
+
"activation_dropout": 0.0,
|
| 10 |
+
"activation_function": "gelu",
|
| 11 |
+
"attention_dropout": 0.0,
|
| 12 |
+
"attention_heads": 16,
|
| 13 |
+
"dropout": 0,
|
| 14 |
+
"dtype": "bfloat16",
|
| 15 |
+
"embed_dim": 1536,
|
| 16 |
+
"ffn_dim": 6144,
|
| 17 |
+
"init_std": 0.02,
|
| 18 |
+
"layer_norm_eps": 1e-05,
|
| 19 |
+
"layerdrop": 0.0,
|
| 20 |
+
"layers": 24,
|
| 21 |
+
"max_position_embeddings": 4096,
|
| 22 |
+
"model_type": "kosmos_2_5_text_model",
|
| 23 |
+
"scale_embedding": true,
|
| 24 |
+
"use_cache": true,
|
| 25 |
+
"vocab_size": 108481
|
| 26 |
+
},
|
| 27 |
+
"transformers_version": "4.56.1",
|
| 28 |
+
"vision_config": {
|
| 29 |
+
"attention_dropout": 0.0,
|
| 30 |
+
"dense_act_fn": "gelu_new",
|
| 31 |
+
"dropout_rate": 0.0,
|
| 32 |
+
"dtype": "bfloat16",
|
| 33 |
+
"head_dim": 64,
|
| 34 |
+
"hidden_size": 1536,
|
| 35 |
+
"initializer_factor": 1.0,
|
| 36 |
+
"initializer_range": 1e-10,
|
| 37 |
+
"intermediate_size": 3968,
|
| 38 |
+
"layer_norm_eps": 1e-06,
|
| 39 |
+
"max_length": 4096,
|
| 40 |
+
"max_num_patches": 4096,
|
| 41 |
+
"model_type": "kosmos_2_5_vision_model",
|
| 42 |
+
"num_attention_heads": 24,
|
| 43 |
+
"num_hidden_layers": 18,
|
| 44 |
+
"patch_embed_hidden_size": 768
|
| 45 |
+
}
|
| 46 |
+
}
|
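The key architecture figures in the config above are 24 text decoder layers, 18 vision encoder layers, and 1536-dimensional hidden states on both sides. They can be read back programmatically; a minimal sketch, assuming a Transformers version with Kosmos-2.5 support (>= 4.56):

```python
# Minimal sketch: inspect the fine-tuned checkpoint's configuration.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("merve/kosmos-2.5-ft")
print(config.model_type)                       # kosmos-2.5
print(config.text_config.layers)               # 24 decoder layers
print(config.vision_config.num_hidden_layers)  # 18 vision encoder layers
print(config.text_config.embed_dim, config.vision_config.hidden_size)  # 1536 1536
```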
models/kosmos-2.5-ft/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 0,
|
| 3 |
+
"eos_token_id": 2,
|
| 4 |
+
"num_beam": 1,
|
| 5 |
+
"pad_token_id": 1,
|
| 6 |
+
"transformers_version": "4.56.1"
|
| 7 |
+
}
|
models/kosmos-2.5-ft/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b51c3ece1fdebe5dcc63a6079ddbae5a3a8f565ad054600b406a11eaa0fc768d
|
| 3 |
+
size 2749368352
|
models/kosmos-2.5-ft/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72903c0342414f6579137b6bf964247c48f931461f653622ceeb02a716dd60c1
|
| 3 |
+
size 5499116125
|
models/kosmos-2.5-ft/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a794a6cb9cd4bbd0c53d08db0e20a5536c789bba6f22113385c1c408d58908bd
|
| 3 |
+
size 14645
|
models/kosmos-2.5-ft/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d3e048a1e5c9dc7581e3872c3b16feadec0e02e34c6509590158830c91d1422
|
| 3 |
+
size 1465
|
models/kosmos-2.5-ft/source.txt
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
https://huggingface.co/merve/kosmos-2.5-ft
|
models/kosmos-2.5-ft/trainer_state.json
ADDED
|
@@ -0,0 +1,111 @@
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1126,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.17777777777777778,
|
| 14 |
+
"grad_norm": 1.1875,
|
| 15 |
+
"learning_rate": 1.827402135231317e-05,
|
| 16 |
+
"loss": 0.4486,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.35555555555555557,
|
| 21 |
+
"grad_norm": 1.21875,
|
| 22 |
+
"learning_rate": 1.6494661921708185e-05,
|
| 23 |
+
"loss": 0.1052,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.5333333333333333,
|
| 28 |
+
"grad_norm": 0.90625,
|
| 29 |
+
"learning_rate": 1.4715302491103204e-05,
|
| 30 |
+
"loss": 0.0932,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.7111111111111111,
|
| 35 |
+
"grad_norm": 1.3984375,
|
| 36 |
+
"learning_rate": 1.2935943060498222e-05,
|
| 37 |
+
"loss": 0.0988,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.8888888888888888,
|
| 42 |
+
"grad_norm": 1.5703125,
|
| 43 |
+
"learning_rate": 1.1156583629893238e-05,
|
| 44 |
+
"loss": 0.0903,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 1.0657777777777777,
|
| 49 |
+
"grad_norm": 1.265625,
|
| 50 |
+
"learning_rate": 9.377224199288258e-06,
|
| 51 |
+
"loss": 0.0912,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 1.2435555555555555,
|
| 56 |
+
"grad_norm": 0.65234375,
|
| 57 |
+
"learning_rate": 7.597864768683275e-06,
|
| 58 |
+
"loss": 0.0798,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 1.4213333333333333,
|
| 63 |
+
"grad_norm": 1.890625,
|
| 64 |
+
"learning_rate": 5.818505338078292e-06,
|
| 65 |
+
"loss": 0.0835,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 1.5991111111111111,
|
| 70 |
+
"grad_norm": 2.015625,
|
| 71 |
+
"learning_rate": 4.03914590747331e-06,
|
| 72 |
+
"loss": 0.0785,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 1.7768888888888887,
|
| 77 |
+
"grad_norm": 1.546875,
|
| 78 |
+
"learning_rate": 2.2597864768683274e-06,
|
| 79 |
+
"loss": 0.0785,
|
| 80 |
+
"step": 1000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.9546666666666668,
|
| 84 |
+
"grad_norm": 2.15625,
|
| 85 |
+
"learning_rate": 4.804270462633452e-07,
|
| 86 |
+
"loss": 0.079,
|
| 87 |
+
"step": 1100
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"logging_steps": 100,
|
| 91 |
+
"max_steps": 1126,
|
| 92 |
+
"num_input_tokens_seen": 0,
|
| 93 |
+
"num_train_epochs": 2,
|
| 94 |
+
"save_steps": 1000,
|
| 95 |
+
"stateful_callbacks": {
|
| 96 |
+
"TrainerControl": {
|
| 97 |
+
"args": {
|
| 98 |
+
"should_epoch_stop": false,
|
| 99 |
+
"should_evaluate": false,
|
| 100 |
+
"should_log": false,
|
| 101 |
+
"should_save": true,
|
| 102 |
+
"should_training_stop": true
|
| 103 |
+
},
|
| 104 |
+
"attributes": {}
|
| 105 |
+
}
|
| 106 |
+
},
|
| 107 |
+
"total_flos": 1.1068310033650483e+17,
|
| 108 |
+
"train_batch_size": 1,
|
| 109 |
+
"trial_name": null,
|
| 110 |
+
"trial_params": null
|
| 111 |
+
}
|
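The `log_history` above is enough to see how the two-epoch DocLayNet run converged (loss drops from ~0.45 at step 100 to ~0.08 by step 1100). A minimal sketch for plotting it, assuming `trainer_state.json` is available locally and `matplotlib` is installed:

```python
# Minimal sketch: plot the training loss recorded in trainer_state.json.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses, marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("Kosmos-2.5 fine-tuning on DocLayNet")
plt.savefig("loss_curve.png")
```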
models/kosmos-2.5-ft/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:285a70ced9abc4734407c103d93dd57bdfb7ce4329159887d23b785dbb4645a4
|
| 3 |
+
size 5777
|
models/kosmos-2.5/.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
output.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
receipt_00008.png filter=lfs diff=lfs merge=lfs -text
|
models/kosmos-2.5/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
license: mit
|
| 4 |
+
library_name: transformers
|
| 5 |
+
pipeline_tag: image-text-to-text
|
| 6 |
+
---
|
| 7 |
+
# Kosmos-2.5
|
| 8 |
+
|
| 9 |
+
[Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
|
| 10 |
+
|
| 11 |
+
## Model description
|
| 12 |
+
|
| 13 |
+
Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
|
| 14 |
+
|
| 15 |
+
[Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
|
| 16 |
+
|
| 17 |
+
## NOTE:
|
| 18 |
+
Since this is a generative model, there is a risk of **hallucination** during generation, and the accuracy of OCR/Markdown results extracted from images **cannot** be guaranteed.
|
| 19 |
+
|
| 20 |
+
## Inference
|
| 21 |
+
|
| 22 |
+
KOSMOS-2.5 is supported in Transformers >= 4.56. Find the docs [here](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5).
|
| 23 |
+
|
| 24 |
+
**Markdown Task:** For usage instructions, please refer to [md.py](md.py).
|
| 25 |
+
|
| 26 |
+
```py
|
| 27 |
+
import re
|
| 28 |
+
import torch
|
| 29 |
+
import requests
|
| 30 |
+
from PIL import Image, ImageDraw
|
| 31 |
+
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
|
| 32 |
+
|
| 33 |
+
repo = "microsoft/kosmos-2.5"
|
| 34 |
+
device = "cuda:0"
|
| 35 |
+
dtype = torch.bfloat16
|
| 36 |
+
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
|
| 37 |
+
processor = AutoProcessor.from_pretrained(repo)
|
| 38 |
+
|
| 39 |
+
# sample image
|
| 40 |
+
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
|
| 41 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 42 |
+
|
| 43 |
+
prompt = "<md>"
|
| 44 |
+
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
| 45 |
+
|
| 46 |
+
height, width = inputs.pop("height"), inputs.pop("width")
|
| 47 |
+
raw_width, raw_height = image.size
|
| 48 |
+
scale_height = raw_height / height
|
| 49 |
+
scale_width = raw_width / width
|
| 50 |
+
|
| 51 |
+
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
| 52 |
+
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
| 53 |
+
generated_ids = model.generate(
|
| 54 |
+
**inputs,
|
| 55 |
+
max_new_tokens=1024,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 59 |
+
print(generated_text[0])
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
**OCR Task:** For usage instructions, please refer to [ocr.py](ocr.py).
|
| 63 |
+
|
| 64 |
+
```py
|
| 65 |
+
import re
|
| 66 |
+
import torch
|
| 67 |
+
import requests
|
| 68 |
+
from PIL import Image, ImageDraw
|
| 69 |
+
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
|
| 70 |
+
|
| 71 |
+
repo = "microsoft/kosmos-2.5"
|
| 72 |
+
device = "cuda:0"
|
| 73 |
+
dtype = torch.bfloat16
|
| 74 |
+
model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
|
| 75 |
+
processor = AutoProcessor.from_pretrained(repo)
|
| 76 |
+
|
| 77 |
+
# sample image
|
| 78 |
+
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
|
| 79 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 80 |
+
|
| 81 |
+
# bs = 1
|
| 82 |
+
prompt = "<ocr>"
|
| 83 |
+
inputs = processor(text=prompt, images=image, return_tensors="pt")
|
| 84 |
+
height, width = inputs.pop("height"), inputs.pop("width")
|
| 85 |
+
raw_width, raw_height = image.size
|
| 86 |
+
scale_height = raw_height / height
|
| 87 |
+
scale_width = raw_width / width
|
| 88 |
+
|
| 89 |
+
# bs > 1, batch generation
|
| 90 |
+
# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
|
| 91 |
+
# height, width = inputs.pop("height"), inputs.pop("width")
|
| 92 |
+
# raw_width, raw_height = image.size
|
| 93 |
+
# scale_height = raw_height / height[0]
|
| 94 |
+
# scale_width = raw_width / width[0]
|
| 95 |
+
|
| 96 |
+
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
|
| 97 |
+
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
|
| 98 |
+
generated_ids = model.generate(
|
| 99 |
+
**inputs,
|
| 100 |
+
max_new_tokens=1024,
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 104 |
+
def post_process(y, scale_height, scale_width):
|
| 105 |
+
y = y.replace(prompt, "")
|
| 106 |
+
if "<md>" in prompt:
|
| 107 |
+
return y
|
| 108 |
+
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
|
| 109 |
+
bboxs_raw = re.findall(pattern, y)
|
| 110 |
+
lines = re.split(pattern, y)[1:]
|
| 111 |
+
bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
|
| 112 |
+
bboxs = [[int(j) for j in i] for i in bboxs]
|
| 113 |
+
info = ""
|
| 114 |
+
for i in range(len(lines)):
|
| 115 |
+
box = bboxs[i]
|
| 116 |
+
x0, y0, x1, y1 = box
|
| 117 |
+
if not (x0 >= x1 or y0 >= y1):
|
| 118 |
+
x0 = int(x0 * scale_width)
|
| 119 |
+
y0 = int(y0 * scale_height)
|
| 120 |
+
x1 = int(x1 * scale_width)
|
| 121 |
+
y1 = int(y1 * scale_height)
|
| 122 |
+
info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
|
| 123 |
+
return info
|
| 124 |
+
|
| 125 |
+
output_text = post_process(generated_text[0], scale_height, scale_width)
|
| 126 |
+
print(output_text)
|
| 127 |
+
|
| 128 |
+
draw = ImageDraw.Draw(image)
|
| 129 |
+
lines = output_text.split("\n")
|
| 130 |
+
for line in lines:
|
| 131 |
+
# draw the bounding box
|
| 132 |
+
line = list(line.split(","))
|
| 133 |
+
if len(line) < 8:
|
| 134 |
+
continue
|
| 135 |
+
line = list(map(int, line[:8]))
|
| 136 |
+
draw.polygon(line, outline="red")
|
| 137 |
+
image.save("output.png")
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
## Citation
|
| 141 |
+
|
| 142 |
+
If you find Kosmos-2.5 useful in your research, please cite the following paper:
|
| 143 |
+
|
| 144 |
+
```
|
| 145 |
+
@article{lv2023kosmos,
|
| 146 |
+
title={Kosmos-2.5: A multimodal literate model},
|
| 147 |
+
author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
|
| 148 |
+
journal={arXiv preprint arXiv:2309.11419},
|
| 149 |
+
year={2023}
|
| 150 |
+
}
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
## License
|
| 154 |
+
The content of this project itself is licensed under the [MIT license](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE).
|
| 155 |
+
|
| 156 |
+
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
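The Markdown and OCR snippets above differ only in the prompt (`<md>` vs. `<ocr>`) and in whether the bounding-box post-processing is applied. They can be folded into one helper; a hedged sketch (the `transcribe` function is illustrative, not part of the original model card) that reuses `model`, `processor`, `device`, `dtype`, and `post_process` exactly as defined above:

```python
# Illustrative wrapper around the two tasks shown above: task="md" or task="ocr".
def transcribe(image, task="ocr", max_new_tokens=1024):
    prompt = f"<{task}>"
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height, scale_width = raw_height / height, raw_width / width

    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    if task == "md":
        return text.replace(prompt, "")
    # post_process (defined above) rescales the <bbox> tokens back to image pixels
    return post_process(text, scale_height, scale_width)
```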
|
models/kosmos-2.5/ckpt.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:676d0efd1dddf3785644918dd598d7734f9ed6e3eb59f806299ca8b7aefa0967
|
| 3 |
+
size 6165757107
|