Update README.md
Browse files
README.md
CHANGED
|
@@ -10,177 +10,6 @@ tags:
|
|
| 10 |
license: mit
|
| 11 |
library_name: transformers
|
| 12 |
---
|
| 13 |
-
|
| 14 |
-
<img src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/logo.svg?raw=true" width="60%" alt="DeepSeek AI" />
|
| 15 |
-
</div>
|
| 16 |
-
<hr>
|
| 17 |
-
<div align="center">
|
| 18 |
-
<a href="https://www.deepseek.com/" target="_blank">
|
| 19 |
-
<img alt="Homepage" src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/badge.svg?raw=true" />
|
| 20 |
-
</a>
|
| 21 |
-
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR" target="_blank">
|
| 22 |
-
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
|
| 23 |
-
</a>
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
<div align="center">
|
| 28 |
-
|
| 29 |
-
<a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
|
| 30 |
-
<img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
|
| 31 |
-
</a>
|
| 32 |
-
<a href="https://twitter.com/deepseek_ai" target="_blank">
|
| 33 |
-
<img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
|
| 34 |
-
</a>
|
| 35 |
-
|
| 36 |
-
</div>
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
<p align="center">
|
| 41 |
-
<a href="https://github.com/deepseek-ai/DeepSeek-OCR"><b>🌟 Github</b></a> |
|
| 42 |
-
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR"><b>📥 Model Download</b></a> |
|
| 43 |
-
<a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek_OCR_paper.pdf"><b>📄 Paper Link</b></a> |
|
| 44 |
-
<a href="https://arxiv.org/abs/2510.18234"><b>📄 Arxiv Paper Link</b></a> |
|
| 45 |
-
</p>
|
| 46 |
-
<h2>
|
| 47 |
-
<p align="center">
|
| 48 |
-
<a href="https://huggingface.co/papers/2510.18234">DeepSeek-OCR: Contexts Optical Compression</a>
|
| 49 |
-
</p>
|
| 50 |
-
</h2>
|
| 51 |
-
<p align="center">
|
| 52 |
-
<img src="assets/fig1.png" style="width: 1000px" align=center>
|
| 53 |
-
</p>
|
| 54 |
-
<p align="center">
|
| 55 |
-
<a href="https://huggingface.co/papers/2510.18234">Explore the boundaries of visual-text compression.</a>
|
| 56 |
-
</p>
|
| 57 |
-
|
| 58 |
-
## Usage
|
| 59 |
-
Inference using Hugging Face Transformers on NVIDIA GPUs. Requirements were tested on Python 3.12.9 + CUDA 11.8:
|
| 60 |
-
|
| 61 |
-
```
|
| 62 |
-
torch==2.6.0
|
| 63 |
-
transformers==4.46.3
|
| 64 |
-
tokenizers==0.20.3
|
| 65 |
-
einops
|
| 66 |
-
addict
|
| 67 |
-
easydict
|
| 68 |
-
pip install flash-attn==2.7.3 --no-build-isolation
|
| 69 |
-
```
|
| 70 |
-
|
| 71 |
-
```python
|
| 72 |
-
from transformers import AutoModel, AutoTokenizer
|
| 73 |
-
import torch
|
| 74 |
-
import os
|
| 75 |
-
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
|
| 76 |
-
model_name = 'deepseek-ai/DeepSeek-OCR'
|
| 77 |
-
|
| 78 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 79 |
-
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)
|
| 80 |
-
model = model.eval().cuda().to(torch.bfloat16)
|
| 81 |
-
|
| 82 |
-
# prompt = "<image>\nFree OCR. "
|
| 83 |
-
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
|
| 84 |
-
image_file = 'your_image.jpg'
|
| 85 |
-
output_path = 'your/output/dir'
|
| 86 |
-
|
| 87 |
-
# infer(self, tokenizer, prompt='', image_file='', output_path = ' ', base_size = 1024, image_size = 640, crop_mode = True, test_compress = False, save_results = False):
|
| 88 |
-
|
| 89 |
-
# Tiny: base_size = 512, image_size = 512, crop_mode = False
|
| 90 |
-
# Small: base_size = 640, image_size = 640, crop_mode = False
|
| 91 |
-
# Base: base_size = 1024, image_size = 1024, crop_mode = False
|
| 92 |
-
# Large: base_size = 1280, image_size = 1280, crop_mode = False
|
| 93 |
-
|
| 94 |
-
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
|
| 95 |
-
|
| 96 |
-
res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = True)
|
| 97 |
-
```
|
| 98 |
-
|
| 99 |
-
## vLLM
|
| 100 |
-
Refer to [🌟GitHub](https://github.com/deepseek-ai/DeepSeek-OCR/) for guidance on model inference acceleration and PDF processing, etc.<!-- -->
|
| 101 |
-
|
| 102 |
-
[2025/10/23] 🚀🚀🚀 DeepSeek-OCR is now officially supported in upstream [vLLM](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR.html#installing-vllm).
|
| 103 |
-
```shell
|
| 104 |
-
uv venv
|
| 105 |
-
source .venv/bin/activate
|
| 106 |
-
# Until the v0.11.1 release, you need to install vLLM from a nightly build
|
| 107 |
-
uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
|
| 108 |
-
```
|
| 109 |
-
|
| 110 |
-
```python
|
| 111 |
-
from vllm import LLM, SamplingParams
|
| 112 |
-
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
|
| 113 |
-
from PIL import Image
|
| 114 |
-
|
| 115 |
-
# Create model instance
|
| 116 |
-
llm = LLM(
|
| 117 |
-
model="deepseek-ai/DeepSeek-OCR",
|
| 118 |
-
enable_prefix_caching=False,
|
| 119 |
-
mm_processor_cache_gb=0,
|
| 120 |
-
logits_processors=[NGramPerReqLogitsProcessor]
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
# Prepare batched input with your image file
|
| 124 |
-
image_1 = Image.open("path/to/your/image_1.png").convert("RGB")
|
| 125 |
-
image_2 = Image.open("path/to/your/image_2.png").convert("RGB")
|
| 126 |
-
prompt = "<image>\nFree OCR."
|
| 127 |
-
|
| 128 |
-
model_input = [
|
| 129 |
-
{
|
| 130 |
-
"prompt": prompt,
|
| 131 |
-
"multi_modal_data": {"image": image_1}
|
| 132 |
-
},
|
| 133 |
-
{
|
| 134 |
-
"prompt": prompt,
|
| 135 |
-
"multi_modal_data": {"image": image_2}
|
| 136 |
-
}
|
| 137 |
-
]
|
| 138 |
-
|
| 139 |
-
sampling_param = SamplingParams(
|
| 140 |
-
temperature=0.0,
|
| 141 |
-
max_tokens=8192,
|
| 142 |
-
# ngram logit processor args
|
| 143 |
-
extra_args=dict(
|
| 144 |
-
ngram_size=30,
|
| 145 |
-
window_size=90,
|
| 146 |
-
whitelist_token_ids={128821, 128822}, # whitelist: <td>, </td>
|
| 147 |
-
),
|
| 148 |
-
skip_special_tokens=False,
|
| 149 |
-
)
|
| 150 |
-
# Generate output
|
| 151 |
-
model_outputs = llm.generate(model_input, sampling_param)
|
| 152 |
-
|
| 153 |
-
# Print output
|
| 154 |
-
for output in model_outputs:
|
| 155 |
-
print(output.outputs[0].text)
|
| 156 |
-
```
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
## Visualizations
|
| 160 |
-
<table>
|
| 161 |
-
<tr>
|
| 162 |
-
<td><img src="assets/show1.jpg" style="width: 500px"></td>
|
| 163 |
-
<td><img src="assets/show2.jpg" style="width: 500px"></td>
|
| 164 |
-
</tr>
|
| 165 |
-
<tr>
|
| 166 |
-
<td><img src="assets/show3.jpg" style="width: 500px"></td>
|
| 167 |
-
<td><img src="assets/show4.jpg" style="width: 500px"></td>
|
| 168 |
-
</tr>
|
| 169 |
-
</table>
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
## Acknowledgement
|
| 173 |
-
|
| 174 |
-
We would like to thank [Vary](https://github.com/Ucas-HaoranWei/Vary/), [GOT-OCR2.0](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/), [MinerU](https://github.com/opendatalab/MinerU), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [OneChart](https://github.com/LingyvKong/OneChart), [Slow Perception](https://github.com/Ucas-HaoranWei/Slow-Perception) for their valuable models and ideas.
|
| 175 |
-
|
| 176 |
-
We also appreciate the benchmarks: [Fox](https://github.com/ucaslcl/Fox), [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
## Citation
|
| 180 |
-
```bibtex
|
| 181 |
-
@article{wei2025deepseek,
|
| 182 |
-
title={DeepSeek-OCR: Contexts Optical Compression},
|
| 183 |
-
author={Wei, Haoran and Sun, Yaofeng and Li, Yukun},
|
| 184 |
-
journal={arXiv preprint arXiv:2510.18234},
|
| 185 |
-
year={2025}
|
| 186 |
-
}
|
|
|
|
| 10 |
license: mit
|
| 11 |
library_name: transformers
|
| 12 |
---
|
| 13 |
+
# Disclaimer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
This model is provided for experimental purposes only. Its accuracy, stability, and suitability for deployment are not guaranteed. Users are advised to independently evaluate the model before any practical or production use.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|