redhairedshanks1 committed
Commit b56e481 · verified · 1 Parent(s): 3a12210

Upload 61 files

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +3 -35
  2. .gitignore +123 -0
  3. LICENSE +21 -0
  4. NOTICE +0 -0
  5. README.md +1228 -12
  6. assets/blog.md +1044 -0
  7. assets/chart.png +3 -0
  8. assets/logo.png +3 -0
  9. assets/showcase/Tibetan.png +3 -0
  10. assets/showcase/formula1.png +3 -0
  11. assets/showcase/formula2.png +3 -0
  12. assets/showcase/formula3.png +3 -0
  13. assets/showcase/grounding.png +3 -0
  14. assets/showcase/kannada.png +3 -0
  15. assets/showcase/nl.png +3 -0
  16. assets/showcase/reading_order.png +3 -0
  17. assets/showcase/russian.png +3 -0
  18. assets/showcase/table1.png +3 -0
  19. assets/showcase/table2.png +3 -0
  20. assets/showcase/table3.png +3 -0
  21. assets/showcase/tradition_zh.png +3 -0
  22. assets/showcase_origin/Tibetan.png +3 -0
  23. assets/showcase_origin/formula_1.jpg +3 -0
  24. assets/showcase_origin/formula_2.jpg +3 -0
  25. assets/showcase_origin/formula_3.jpg +3 -0
  26. assets/showcase_origin/kannada.jpg +3 -0
  27. assets/showcase_origin/nl.png +3 -0
  28. assets/showcase_origin/reading_order.png +3 -0
  29. assets/showcase_origin/russian.png +3 -0
  30. assets/showcase_origin/table_1.jpg +3 -0
  31. assets/showcase_origin/table_2.jpg +3 -0
  32. assets/showcase_origin/table_3.jpg +3 -0
  33. assets/showcase_origin/tradition_zh.png +3 -0
  34. assets/wechat.png +3 -0
  35. demo/demo_colab_remote_server.ipynb +0 -0
  36. demo/demo_gradio.py +726 -0
  37. demo/demo_gradio_annotion.py +666 -0
  38. demo/demo_hf.py +71 -0
  39. demo/demo_image1.jpg +3 -0
  40. demo/demo_pdf1.pdf +3 -0
  41. demo/demo_streamlit.py +222 -0
  42. demo/demo_vllm.py +42 -0
  43. demo/launch_model_vllm.sh +17 -0
  44. docker/Dockerfile +4 -0
  45. docker/docker-compose.yml +44 -0
  46. dots.ocr LICENSE AGREEMENT +109 -0
  47. dots_ocr/__init__.py +1 -0
  48. dots_ocr/model/inference.py +50 -0
  49. dots_ocr/parser.py +428 -0
  50. dots_ocr/utils/__init__.py +1 -0
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.jpg filter=lfs diff=lfs merge=lfs -text
3
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,123 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ weights/
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ .hypothesis/
46
+ .pytest_cache/
47
+
48
+ # Translations
49
+ *.mo
50
+ *.pot
51
+
52
+ # Django stuff:
53
+ *.log
54
+ local_settings.py
55
+ db.sqlite3
56
+
57
+ # Flask stuff:
58
+ instance/
59
+ .webassets-cache
60
+
61
+ # Scrapy stuff:
62
+ .scrapy
63
+
64
+ # Sphinx documentation
65
+ docs/_build/
66
+
67
+ # PyBuilder
68
+ target/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints
72
+
73
+ # pyenv
74
+ .python-version
75
+
76
+ # celery beat schedule file
77
+ celerybeat-schedule
78
+
79
+ # SageMath parsed files
80
+ *.sage.py
81
+
82
+ # Environments
83
+ .env
84
+ .venv
85
+ env/
86
+ venv/
87
+ ENV/
88
+ env.bak/
89
+ venv.bak/
90
+
91
+ # Spyder project settings
92
+ .spyderproject
93
+ .spyproject
94
+
95
+ # Rope project settings
96
+ .ropeproject
97
+
98
+ # mkdocs documentation
99
+ /site
100
+
101
+ # mypy
102
+ .mypy_cache/
103
+ .dmypy.json
104
+ dmypy.json
105
+
106
+ # IDEs
107
+ .vscode/
108
+ .idea/
109
+ *.swp
110
+ *.swo
111
+ *~
112
+
113
+ # MacOS
114
+ .DS_Store
115
+
116
+ # OCR related
117
+ #*.jpg
118
+ # *.jpeg
119
+ #*.png
120
+ #*.pdf
121
+ temp/
122
+ output/
123
+ # playground/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 rednote-hilab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
NOTICE ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1,12 +1,1228 @@
1
- ---
2
- title: Dots Ocr
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ <div align="center">
2
+
3
+ <p align="center">
4
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/logo.png" width="300"/>
5
+ </p>
6
+
7
+ <h1 align="center">
8
+ dots.ocr: Multilingual Document Layout Parsing in a Single Vision-Language Model
9
+ </h1>
10
+
11
+ [![Blog](https://img.shields.io/badge/Blog-View_on_GitHub-333.svg?logo=github)](https://github.com/rednote-hilab/dots.ocr/blob/master/assets/blog.md)
12
+ [![HuggingFace](https://img.shields.io/badge/HuggingFace%20Weights-black.svg?logo=HuggingFace)](https://huggingface.co/rednote-hilab/dots.ocr)
13
+
14
+
15
+ <div align="center">
16
+ <a href="https://dotsocr.xiaohongshu.com" target="_blank" rel="noopener noreferrer"><strong>🖥️ Live Demo</strong></a> |
17
+ <a href="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/wechat.png" target="_blank" rel="noopener noreferrer"><strong>💬 WeChat</strong></a> |
18
+ <a href="https://www.xiaohongshu.com/user/profile/683ffe42000000001d021a4c" target="_blank" rel="noopener noreferrer"><strong>📕 rednote</strong></a> |
19
+ <a href="https://x.com/rednotehilab" target="_blank" rel="noopener noreferrer"><strong>🐦 X</strong></a>
20
+ </div>
21
+
22
+ </div>
23
+
24
+
25
+
26
+ ## Introduction
27
+
28
+ **dots.ocr** is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance.
29
+
30
+ 1. **Powerful Performance:** **dots.ocr** achieves SOTA performance for text, tables, and reading order on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), while delivering formula recognition results comparable to much larger models such as Doubao-1.5 and Gemini-2.5-Pro.
31
+ 2. **Multilingual Support:** **dots.ocr** demonstrates robust parsing capabilities for low-resource languages, achieving decisive advantages across both layout detection and content recognition on our in-house multilingual documents benchmark.
32
+ 3. **Unified and Simple Architecture:** By leveraging a single vision-language model, **dots.ocr** offers a significantly more streamlined architecture than conventional methods that rely on complex, multi-model pipelines. Switching between tasks is accomplished simply by altering the input prompt, proving that a VLM can achieve competitive detection results compared to traditional detection models like DocLayout-YOLO.
33
+ 4. **Efficient and Fast Performance:** Built upon a compact 1.7B LLM, **dots.ocr** provides faster inference speeds than many other high-performing models based on larger foundations.
34
+
35
+
36
+ ### Performance Comparison: dots.ocr vs. Competing Models
37
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/chart.png" border="0" />
38
+
39
+ > **Notes:**
40
+ > - The EN and ZH metrics are the end-to-end evaluation results on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and the Multilingual metric is the end-to-end evaluation result on dots.ocr-bench.
41
+
42
+
43
+ ## News
44
+ * ```2025.07.30 ``` 🚀 We release [dots.ocr](https://github.com/rednote-hilab/dots.ocr) — a multilingual document parsing model built on a 1.7B LLM, with SOTA performance.
45
+
46
+
47
+
48
+ ## Benchmark Results
49
+
50
+ ### 1. OmniDocBench
51
+
52
+ #### The end-to-end evaluation results of different tasks.
53
+
54
+ <table>
55
+ <thead>
56
+ <tr>
57
+ <th rowspan="2"><strong>Model<br>Type</strong></th>
58
+ <th rowspan="2"><strong>Methods</strong></th>
59
+ <th colspan="2"><strong>Overall<sup>Edit</sup>↓</strong></th>
60
+ <th colspan="2"><strong>Text<sup>Edit</sup>↓</strong></th>
61
+ <th colspan="2"><strong>Formula<sup>Edit</sup>↓</strong></th>
62
+ <th colspan="2"><strong>Table<sup>TEDS</sup>↑</strong></th>
63
+ <th colspan="2"><strong>Table<sup>Edit</sup>↓</strong></th>
64
+ <th colspan="2"><strong>Read Order<sup>Edit</sup>↓</strong></th>
65
+ </tr>
66
+ <tr>
67
+ <th><em>EN</em></th>
68
+ <th><em>ZH</em></th>
69
+ <th><em>EN</em></th>
70
+ <th><em>ZH</em></th>
71
+ <th><em>EN</em></th>
72
+ <th><em>ZH</em></th>
73
+ <th><em>EN</em></th>
74
+ <th><em>ZH</em></th>
75
+ <th><em>EN</em></th>
76
+ <th><em>ZH</em></th>
77
+ <th><em>EN</em></th>
78
+ <th><em>ZH</em></th>
79
+ </tr>
80
+ </thead>
81
+ <tbody>
82
+ <tr>
83
+ <td rowspan="8"><strong>Pipeline<br>Tools</strong></td>
84
+ <td>MinerU</td>
85
+ <td>0.150</td>
86
+ <td>0.357</td>
87
+ <td>0.061</td>
88
+ <td>0.215</td>
89
+ <td>0.278</td>
90
+ <td>0.577</td>
91
+ <td>78.6</td>
92
+ <td>62.1</td>
93
+ <td>0.180</td>
94
+ <td>0.344</td>
95
+ <td>0.079</td>
96
+ <td>0.292</td>
97
+ </tr>
98
+ <tr>
99
+ <td>Marker</td>
100
+ <td>0.336</td>
101
+ <td>0.556</td>
102
+ <td>0.080</td>
103
+ <td>0.315</td>
104
+ <td>0.530</td>
105
+ <td>0.883</td>
106
+ <td>67.6</td>
107
+ <td>49.2</td>
108
+ <td>0.619</td>
109
+ <td>0.685</td>
110
+ <td>0.114</td>
111
+ <td>0.340</td>
112
+ </tr>
113
+ <tr>
114
+ <td>Mathpix</td>
115
+ <td>0.191</td>
116
+ <td>0.365</td>
117
+ <td>0.105</td>
118
+ <td>0.384</td>
119
+ <td>0.306</td>
120
+ <td>0.454</td>
121
+ <td>77.0</td>
122
+ <td>67.1</td>
123
+ <td>0.243</td>
124
+ <td>0.320</td>
125
+ <td>0.108</td>
126
+ <td>0.304</td>
127
+ </tr>
128
+ <tr>
129
+ <td>Docling</td>
130
+ <td>0.589</td>
131
+ <td>0.909</td>
132
+ <td>0.416</td>
133
+ <td>0.987</td>
134
+ <td>0.999</td>
135
+ <td>1</td>
136
+ <td>61.3</td>
137
+ <td>25.0</td>
138
+ <td>0.627</td>
139
+ <td>0.810</td>
140
+ <td>0.313</td>
141
+ <td>0.837</td>
142
+ </tr>
143
+ <tr>
144
+ <td>Pix2Text</td>
145
+ <td>0.320</td>
146
+ <td>0.528</td>
147
+ <td>0.138</td>
148
+ <td>0.356</td>
149
+ <td>0.276</td>
150
+ <td>0.611</td>
151
+ <td>73.6</td>
152
+ <td>66.2</td>
153
+ <td>0.584</td>
154
+ <td>0.645</td>
155
+ <td>0.281</td>
156
+ <td>0.499</td>
157
+ </tr>
158
+ <tr>
159
+ <td>Unstructured</td>
160
+ <td>0.586</td>
161
+ <td>0.716</td>
162
+ <td>0.198</td>
163
+ <td>0.481</td>
164
+ <td>0.999</td>
165
+ <td>1</td>
166
+ <td>0</td>
167
+ <td>0.06</td>
168
+ <td>1</td>
169
+ <td>0.998</td>
170
+ <td>0.145</td>
171
+ <td>0.387</td>
172
+ </tr>
173
+ <tr>
174
+ <td>OpenParse</td>
175
+ <td>0.646</td>
176
+ <td>0.814</td>
177
+ <td>0.681</td>
178
+ <td>0.974</td>
179
+ <td>0.996</td>
180
+ <td>1</td>
181
+ <td>64.8</td>
182
+ <td>27.5</td>
183
+ <td>0.284</td>
184
+ <td>0.639</td>
185
+ <td>0.595</td>
186
+ <td>0.641</td>
187
+ </tr>
188
+ <tr>
189
+ <td>PPStruct-V3</td>
190
+ <td>0.145</td>
191
+ <td>0.206</td>
192
+ <td>0.058</td>
193
+ <td>0.088</td>
194
+ <td>0.295</td>
195
+ <td>0.535</td>
196
+ <td>-</td>
197
+ <td>-</td>
198
+ <td>0.159</td>
199
+ <td>0.109</td>
200
+ <td>0.069</td>
201
+ <td>0.091</td>
202
+ </tr>
203
+ <tr>
204
+ <td rowspan="9"><strong>Expert<br>VLMs</strong></td>
205
+ <td>GOT-OCR</td>
206
+ <td>0.287</td>
207
+ <td>0.411</td>
208
+ <td>0.189</td>
209
+ <td>0.315</td>
210
+ <td>0.360</td>
211
+ <td>0.528</td>
212
+ <td>53.2</td>
213
+ <td>47.2</td>
214
+ <td>0.459</td>
215
+ <td>0.520</td>
216
+ <td>0.141</td>
217
+ <td>0.280</td>
218
+ </tr>
219
+ <tr>
220
+ <td>Nougat</td>
221
+ <td>0.452</td>
222
+ <td>0.973</td>
223
+ <td>0.365</td>
224
+ <td>0.998</td>
225
+ <td>0.488</td>
226
+ <td>0.941</td>
227
+ <td>39.9</td>
228
+ <td>0</td>
229
+ <td>0.572</td>
230
+ <td>1.000</td>
231
+ <td>0.382</td>
232
+ <td>0.954</td>
233
+ </tr>
234
+ <tr>
235
+ <td>Mistral OCR</td>
236
+ <td>0.268</td>
237
+ <td>0.439</td>
238
+ <td>0.072</td>
239
+ <td>0.325</td>
240
+ <td>0.318</td>
241
+ <td>0.495</td>
242
+ <td>75.8</td>
243
+ <td>63.6</td>
244
+ <td>0.600</td>
245
+ <td>0.650</td>
246
+ <td>0.083</td>
247
+ <td>0.284</td>
248
+ </tr>
249
+ <tr>
250
+ <td>OLMOCR-sglang</td>
251
+ <td>0.326</td>
252
+ <td>0.469</td>
253
+ <td>0.097</td>
254
+ <td>0.293</td>
255
+ <td>0.455</td>
256
+ <td>0.655</td>
257
+ <td>68.1</td>
258
+ <td>61.3</td>
259
+ <td>0.608</td>
260
+ <td>0.652</td>
261
+ <td>0.145</td>
262
+ <td>0.277</td>
263
+ </tr>
264
+ <tr>
265
+ <td>SmolDocling-256M</td>
266
+ <td>0.493</td>
267
+ <td>0.816</td>
268
+ <td>0.262</td>
269
+ <td>0.838</td>
270
+ <td>0.753</td>
271
+ <td>0.997</td>
272
+ <td>44.9</td>
273
+ <td>16.5</td>
274
+ <td>0.729</td>
275
+ <td>0.907</td>
276
+ <td>0.227</td>
277
+ <td>0.522</td>
278
+ </tr>
279
+ <tr>
280
+ <td>Dolphin</td>
281
+ <td>0.206</td>
282
+ <td>0.306</td>
283
+ <td>0.107</td>
284
+ <td>0.197</td>
285
+ <td>0.447</td>
286
+ <td>0.580</td>
287
+ <td>77.3</td>
288
+ <td>67.2</td>
289
+ <td>0.180</td>
290
+ <td>0.285</td>
291
+ <td>0.091</td>
292
+ <td>0.162</td>
293
+ </tr>
294
+ <tr>
295
+ <td>MinerU 2</td>
296
+ <td>0.139</td>
297
+ <td>0.240</td>
298
+ <td>0.047</td>
299
+ <td>0.109</td>
300
+ <td>0.297</td>
301
+ <td>0.536</td>
302
+ <td>82.5</td>
303
+ <td>79.0</td>
304
+ <td>0.141</td>
305
+ <td>0.195</td>
306
+ <td>0.069</td>
307
+ <td>0.118</td>
308
+ </tr>
309
+ <tr>
310
+ <td>OCRFlux</td>
311
+ <td>0.195</td>
312
+ <td>0.281</td>
313
+ <td>0.064</td>
314
+ <td>0.183</td>
315
+ <td>0.379</td>
316
+ <td>0.613</td>
317
+ <td>71.6</td>
318
+ <td>81.3</td>
319
+ <td>0.253</td>
320
+ <td>0.139</td>
321
+ <td>0.086</td>
322
+ <td>0.187</td>
323
+ </tr>
324
+ <tr>
325
+ <td>MonkeyOCR-pro-3B</td>
326
+ <td>0.138</td>
327
+ <td>0.206</td>
328
+ <td>0.067</td>
329
+ <td>0.107</td>
330
+ <td><strong>0.246</strong></td>
331
+ <td>0.421</td>
332
+ <td>81.5</td>
333
+ <td>87.5</td>
334
+ <td>0.139</td>
335
+ <td>0.111</td>
336
+ <td>0.100</td>
337
+ <td>0.185</td>
338
+ </tr>
339
+ <tr>
340
+
341
+ <td rowspan="5"><strong>General<br>VLMs</strong></td>
342
+ <td>GPT4o</td>
343
+ <td>0.233</td>
344
+ <td>0.399</td>
345
+ <td>0.144</td>
346
+ <td>0.409</td>
347
+ <td>0.425</td>
348
+ <td>0.606</td>
349
+ <td>72.0</td>
350
+ <td>62.9</td>
351
+ <td>0.234</td>
352
+ <td>0.329</td>
353
+ <td>0.128</td>
354
+ <td>0.251</td>
355
+ </tr>
356
+ <tr>
357
+ <td>Qwen2-VL-72B</td>
358
+ <td>0.252</td>
359
+ <td>0.327</td>
360
+ <td>0.096</td>
361
+ <td>0.218</td>
362
+ <td>0.404</td>
363
+ <td>0.487</td>
364
+ <td>76.8</td>
365
+ <td>76.4</td>
366
+ <td>0.387</td>
367
+ <td>0.408</td>
368
+ <td>0.119</td>
369
+ <td>0.193</td>
370
+ </tr>
371
+ <tr>
372
+ <td>Qwen2.5-VL-72B</td>
373
+ <td>0.214</td>
374
+ <td>0.261</td>
375
+ <td>0.092</td>
376
+ <td>0.18</td>
377
+ <td>0.315</td>
378
+ <td>0.434</td>
379
+ <td>82.9</td>
380
+ <td>83.9</td>
381
+ <td>0.341</td>
382
+ <td>0.262</td>
383
+ <td>0.106</td>
384
+ <td>0.168</td>
385
+ </tr>
386
+ <tr>
387
+ <td>Gemini2.5-Pro</td>
388
+ <td>0.148</td>
389
+ <td>0.212</td>
390
+ <td>0.055</td>
391
+ <td>0.168</td>
392
+ <td>0.356</td>
393
+ <td>0.439</td>
394
+ <td>85.8</td>
395
+ <td>86.4</td>
396
+ <td>0.13</td>
397
+ <td>0.119</td>
398
+ <td>0.049</td>
399
+ <td>0.121</td>
400
+ </tr>
401
+ <tr>
402
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
403
+ <td>0.140</td>
404
+ <td>0.162</td>
405
+ <td>0.043</td>
406
+ <td>0.085</td>
407
+ <td>0.295</td>
408
+ <td><strong>0.384</strong></td>
409
+ <td>83.3</td>
410
+ <td><strong>89.3</strong></td>
411
+ <td>0.165</td>
412
+ <td><strong>0.085</strong></td>
413
+ <td>0.058</td>
414
+ <td>0.094</td>
415
+ </tr>
416
+ <tr>
417
+ <td rowspan="1"><strong>Expert VLMs</strong></td>
418
+ <td><strong>dots.ocr</strong></td>
419
+ <td><strong>0.125</strong></td>
420
+ <td><strong>0.160</strong></td>
421
+ <td><strong>0.032</strong></td>
422
+ <td><strong>0.066</strong></td>
423
+ <td>0.329</td>
424
+ <td>0.416</td>
425
+ <td><strong>88.6</strong></td>
426
+ <td>89.0</td>
427
+ <td><strong>0.099</strong></td>
428
+ <td>0.092</td>
429
+ <td><strong>0.040</strong></td>
430
+ <td><strong>0.067</strong></td>
431
+ </tr>
432
+ <tr>
433
+ </tbody>
434
+ </table>
435
+
436
+
437
+ #### The end-to-end text recognition performance across 9 PDF page types.
438
+
439
+ <table>
440
+ <thead>
441
+ <tr>
442
+ <th><strong>Model<br>Type</strong></th>
443
+ <th><strong>Models</strong></th>
444
+ <th><strong>Book</strong></th>
445
+ <th><strong>Slides</strong></th>
446
+ <th><strong>Financial<br>Report</strong></th>
447
+ <th><strong>Textbook</strong></th>
448
+ <th><strong>Exam<br>Paper</strong></th>
449
+ <th><strong>Magazine</strong></th>
450
+ <th><strong>Academic<br>Papers</strong></th>
451
+ <th><strong>Notes</strong></th>
452
+ <th><strong>Newspaper</strong></th>
453
+ <th><strong>Overall</strong></th>
454
+ </tr>
455
+ </thead>
456
+ <tbody>
457
+ <tr>
458
+ <td rowspan="3"><strong>Pipeline<br>Tools</strong></td>
459
+ <td>MinerU</td>
460
+ <td>0.055</td>
461
+ <td>0.124</td>
462
+ <td><u>0.033</u></td>
463
+ <td>0.102</td>
464
+ <td>0.159</td>
465
+ <td><strong>0.072</strong></td>
466
+ <td><u>0.025</u></td>
467
+ <td>0.984</td>
468
+ <td>0.171</td>
469
+ <td>0.206</td>
470
+ </tr>
471
+ <tr>
472
+ <td>Marker</td>
473
+ <td>0.074</td>
474
+ <td>0.340</td>
475
+ <td>0.089</td>
476
+ <td>0.319</td>
477
+ <td>0.452</td>
478
+ <td>0.153</td>
479
+ <td>0.059</td>
480
+ <td>0.651</td>
481
+ <td>0.192</td>
482
+ <td>0.274</td>
483
+ </tr>
484
+ <tr>
485
+ <td>Mathpix</td>
486
+ <td>0.131</td>
487
+ <td>0.220</td>
488
+ <td>0.202</td>
489
+ <td>0.216</td>
490
+ <td>0.278</td>
491
+ <td>0.147</td>
492
+ <td>0.091</td>
493
+ <td>0.634</td>
494
+ <td>0.690</td>
495
+ <td>0.300</td>
496
+ </tr>
497
+ <tr>
498
+ <td rowspan="5"><strong>Expert<br>VLMs</strong></td>
499
+ <td>GOT-OCR</td>
500
+ <td>0.111</td>
501
+ <td>0.222</td>
502
+ <td>0.067</td>
503
+ <td>0.132</td>
504
+ <td>0.204</td>
505
+ <td>0.198</td>
506
+ <td>0.179</td>
507
+ <td>0.388</td>
508
+ <td>0.771</td>
509
+ <td>0.267</td>
510
+ </tr>
511
+ <tr>
512
+ <td>Nougat</td>
513
+ <td>0.734</td>
514
+ <td>0.958</td>
515
+ <td>1.000</td>
516
+ <td>0.820</td>
517
+ <td>0.930</td>
518
+ <td>0.830</td>
519
+ <td>0.214</td>
520
+ <td>0.991</td>
521
+ <td>0.871</td>
522
+ <td>0.806</td>
523
+ </tr>
524
+ <tr>
525
+ <td>Dolphin</td>
526
+ <td>0.091</td>
527
+ <td>0.131</td>
528
+ <td>0.057</td>
529
+ <td>0.146</td>
530
+ <td>0.231</td>
531
+ <td>0.121</td>
532
+ <td>0.074</td>
533
+ <td>0.363</td>
534
+ <td>0.307</td>
535
+ <td>0.177</td>
536
+ </tr>
537
+ <tr>
538
+ <td>OCRFlux</td>
539
+ <td>0.068</td>
540
+ <td>0.125</td>
541
+ <td>0.092</td>
542
+ <td>0.102</td>
543
+ <td>0.119</td>
544
+ <td>0.083</td>
545
+ <td>0.047</td>
546
+ <td>0.223</td>
547
+ <td>0.536</td>
548
+ <td>0.149</td>
549
+ </tr>
550
+ <tr>
551
+ <td>MonkeyOCR-pro-3B</td>
552
+ <td>0.084</td>
553
+ <td>0.129</td>
554
+ <td>0.060</td>
555
+ <td>0.090</td>
556
+ <td>0.107</td>
557
+ <td>0.073</td>
558
+ <td>0.050</td>
559
+ <td>0.171</td>
560
+ <td>0.107</td>
561
+ <td>0.100</td>
562
+ </tr>
563
+ <tr>
564
+ <td rowspan="4"><strong>General<br>VLMs</strong></td>
565
+ <td>GPT4o</td>
566
+ <td>0.157</td>
567
+ <td>0.163</td>
568
+ <td>0.348</td>
569
+ <td>0.187</td>
570
+ <td>0.281</td>
571
+ <td>0.173</td>
572
+ <td>0.146</td>
573
+ <td>0.607</td>
574
+ <td>0.751</td>
575
+ <td>0.316</td>
576
+ </tr>
577
+ <tr>
578
+ <td>Qwen2.5-VL-7B</td>
579
+ <td>0.148</td>
580
+ <td>0.053</td>
581
+ <td>0.111</td>
582
+ <td>0.137</td>
583
+ <td>0.189</td>
584
+ <td>0.117</td>
585
+ <td>0.134</td>
586
+ <td>0.204</td>
587
+ <td>0.706</td>
588
+ <td>0.205</td>
589
+ </tr>
590
+ <tr>
591
+ <td>InternVL3-8B</td>
592
+ <td>0.163</td>
593
+ <td>0.056</td>
594
+ <td>0.107</td>
595
+ <td>0.109</td>
596
+ <td>0.129</td>
597
+ <td>0.100</td>
598
+ <td>0.159</td>
599
+ <td>0.150</td>
600
+ <td>0.681</td>
601
+ <td>0.188</td>
602
+ </tr>
603
+ <tr>
604
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
605
+ <td>0.048</td>
606
+ <td>0.048</td>
607
+ <td>0.024</td>
608
+ <td><strong>0.062</strong></td>
609
+ <td>0.085</td>
610
+ <td>0.051</td>
611
+ <td>0.039</td>
612
+ <td><strong>0.096</strong></td>
613
+ <td>0.181</td>
614
+ <td>0.073</td>
615
+ </tr>
616
+ <tr>
617
+ <td rowspan="1"><strong>Expert VLMs</strong></td>
618
+ <td><strong>dots.ocr</strong></td>
619
+ <td><strong>0.031</strong></td>
620
+ <td><strong>0.047</strong></td>
621
+ <td><strong>0.011</strong></td>
622
+ <td>0.082</td>
623
+ <td><strong>0.079</strong></td>
624
+ <td><strong>0.028</strong></td>
625
+ <td><strong>0.029</strong></td>
626
+ <td>0.109</td>
627
+ <td><strong>0.056</strong></td>
628
+ <td><strong>0.055</strong></td>
629
+ </tr>
630
+
631
+ </tbody>
632
+ </table>
633
+
634
+ > **Notes:**
635
+ > - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
636
+ > - We delete the Page-header and Page-footer cells in the result markdown.
637
+ > - We use the tikz_preprocess pipeline to upsample the images to 200 DPI.
638
+
639
+
640
+ ### 2. **dots.ocr-bench**
641
+
642
+ This is an in-house benchmark containing 1493 PDF images covering 100 languages.
643
+
644
+ #### The end-to-end evaluation results of different tasks.
645
+
646
+ <table>
647
+ <thead>
648
+ <tr>
649
+ <th rowspan="1"><strong>Methods</strong></th>
650
+ <th colspan="1"><strong>Overall<sup>Edit</sup>↓</strong></th>
651
+ <th colspan="1"><strong>Text<sup>Edit</sup>↓</strong></th>
652
+ <th colspan="1"><strong>Formula<sup>Edit</sup>↓</strong></th>
653
+ <th colspan="1"><strong>Table<sup>TEDS</sup>↑</strong></th>
654
+ <th colspan="1"><strong>Table<sup>Edit</sup>↓</strong></th>
655
+ <th colspan="1"><strong>Read Order<sup>Edit</sup>↓</strong></th>
656
+ </tr>
657
+ </thead>
658
+ <tbody>
659
+ <td>MonkeyOCR-3B</td>
660
+ <td>0.483</td>
661
+ <td>0.445</td>
662
+ <td>0.627</td>
663
+ <td>50.93</td>
664
+ <td>0.452</td>
665
+ <td>0.409</td>
666
+ </tr>
667
+ <tr>
668
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
669
+ <td>0.291</td>
670
+ <td>0.226</td>
671
+ <td>0.440</td>
672
+ <td>71.2</td>
673
+ <td>0.260</td>
674
+ <td>0.238</td>
675
+ </tr>
676
+ <tr>
677
+ <td>doubao-1-6</td>
678
+ <td>0.299</td>
679
+ <td>0.270</td>
680
+ <td>0.417</td>
681
+ <td>71.0</td>
682
+ <td>0.258</td>
683
+ <td>0.253</td>
684
+ </tr>
685
+ <tr>
686
+ <td>Gemini2.5-Pro</td>
687
+ <td>0.251</td>
688
+ <td>0.163</td>
689
+ <td>0.402</td>
690
+ <td>77.1</td>
691
+ <td>0.236</td>
692
+ <td>0.202</td>
693
+ </tr>
694
+ <tr>
695
+ <td><strong>dots.ocr</strong> </td>
696
+ <td><strong>0.177</strong></td>
697
+ <td><strong>0.075</strong></td>
698
+ <td><strong>0.297</strong></td>
699
+ <td><strong>79.2</strong></td>
700
+ <td><strong>0.186</strong></td>
701
+ <td><strong>0.152</strong></td>
702
+ </tr>
703
+
704
+ </tbody>
705
+ </table>
706
+
707
+ > **Notes:**
708
+ > - We use the same metric calculation pipeline as [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
709
+ > - We delete the Page-header and Page-footer cells in the result markdown.
710
+
711
+ #### Layout Detection
712
+
713
+ <table>
714
+ <thead>
715
+ <tr>
716
+ <th rowspan="2"><strong>Method</strong></th>
717
+ <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50:.05:.95↑</strong></th>
718
+ <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50↑</strong></th>
719
+ </tr>
720
+ <tr>
721
+ <th>Overall</th>
722
+ <th>Text</th>
723
+ <th>Formula</th>
724
+ <th>Table</th>
725
+ <th>Picture</th>
726
+ <th>Overall</th>
727
+ <th>Text</th>
728
+ <th>Formula</th>
729
+ <th>Table</th>
730
+ <th>Picture</th>
731
+ </tr>
732
+ </thead>
733
+
734
+ <tbody>
735
+ <td>DocLayout-YOLO-DocStructBench</td>
736
+ <td>0.733</td>
737
+ <td>0.694</td>
738
+ <td>0.480</td>
739
+ <td>0.803</td>
740
+ <td>0.619</td>
741
+ <td>0.806</td>
742
+ <td>0.779</td>
743
+ <td>0.620</td>
744
+ <td>0.858</td>
745
+ <td>0.678</td>
746
+ </tr>
747
+
748
+ <tr>
749
+ <td>dots.ocr-parse all</td>
750
+ <td>0.831</td>
751
+ <td>0.801</td>
752
+ <td>0.654</td>
753
+ <td>0.838</td>
754
+ <td>0.748</td>
755
+ <td>0.922</td>
756
+ <td>0.909</td>
757
+ <td>0.770</td>
758
+ <td>0.888</td>
759
+ <td>0.831</td>
760
+ </tr>
761
+
762
+ <tr>
763
+ <td> <strong>dots.ocr-detection only</strong> </td>
764
+ <td><strong>0.845</strong></td>
765
+ <td><strong>0.816</strong></td>
766
+ <td><strong>0.716</strong></td>
767
+ <td><strong>0.875</strong></td>
768
+ <td><strong>0.765</strong></td>
769
+ <td><strong>0.930</strong></td>
770
+ <td><strong>0.917</strong></td>
771
+ <td><strong>0.832</strong></td>
772
+ <td><strong>0.918</strong></td>
773
+ <td><strong>0.843</strong></td>
774
+ </tr>
775
+
776
+ </tbody>
777
+ </table>
778
+
779
+ > **Notes:**
780
+ > - We use prompt_layout_all_en for **parse all** and prompt_layout_only_en for **detection only**; please refer to [prompts](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)
781
+
782
+
783
+ ### 3. olmOCR-bench
784
+
785
+ <table>
786
+ <thead>
787
+ <tr>
788
+ <th>Model</th>
789
+ <th>ArXiv</th>
790
+ <th>Old Scans<br>Math</th>
791
+ <th>Tables</th>
792
+ <th>Old Scans</th>
793
+ <th>Headers and<br>Footers</th>
794
+ <th>Multi<br>column</th>
795
+ <th>Long Tiny<br>Text</th>
796
+ <th>Base</th>
797
+ <th>Overall</th>
798
+ </tr>
799
+ </thead>
800
+ <tbody>
801
+ <tr>
802
+ <td>GOT OCR</td>
803
+ <td>52.7</td>
804
+ <td>52.0</td>
805
+ <td>0.2</td>
806
+ <td>22.1</td>
807
+ <td>93.6</td>
808
+ <td>42.0</td>
809
+ <td>29.9</td>
810
+ <td>94.0</td>
811
+ <td>48.3 ± 1.1</td>
812
+ </tr>
813
+ <tr>
814
+ <td>Marker</td>
815
+ <td>76.0</td>
816
+ <td>57.9</td>
817
+ <td>57.6</td>
818
+ <td>27.8</td>
819
+ <td>84.9</td>
820
+ <td>72.9</td>
821
+ <td>84.6</td>
822
+ <td>99.1</td>
823
+ <td>70.1 ± 1.1</td>
824
+ </tr>
825
+ <tr>
826
+ <td>MinerU</td>
827
+ <td>75.4</td>
828
+ <td>47.4</td>
829
+ <td>60.9</td>
830
+ <td>17.3</td>
831
+ <td><strong>96.6</strong></td>
832
+ <td>59.0</td>
833
+ <td>39.1</td>
834
+ <td>96.6</td>
835
+ <td>61.5 ± 1.1</td>
836
+ </tr>
837
+ <tr>
838
+ <td>Mistral OCR</td>
839
+ <td>77.2</td>
840
+ <td>67.5</td>
841
+ <td>60.6</td>
842
+ <td>29.3</td>
843
+ <td>93.6</td>
844
+ <td>71.3</td>
845
+ <td>77.1</td>
846
+ <td>99.4</td>
847
+ <td>72.0 ± 1.1</td>
848
+ </tr>
849
+ <tr>
850
+ <td>Nanonets OCR</td>
851
+ <td>67.0</td>
852
+ <td>68.6</td>
853
+ <td>77.7</td>
854
+ <td>39.5</td>
855
+ <td>40.7</td>
856
+ <td>69.9</td>
857
+ <td>53.4</td>
858
+ <td>99.3</td>
859
+ <td>64.5 ± 1.1</td>
860
+ </tr>
861
+ <tr>
862
+ <td>GPT-4o<br>(No Anchor)</td>
863
+ <td>51.5</td>
864
+ <td><strong>75.5</strong></td>
865
+ <td>69.1</td>
866
+ <td>40.9</td>
867
+ <td>94.2</td>
868
+ <td>68.9</td>
869
+ <td>54.1</td>
870
+ <td>96.7</td>
871
+ <td>68.9 ± 1.1</td>
872
+ </tr>
873
+ <tr>
874
+ <td>GPT-4o<br>(Anchored)</td>
875
+ <td>53.5</td>
876
+ <td>74.5</td>
877
+ <td>70.0</td>
878
+ <td>40.7</td>
879
+ <td>93.8</td>
880
+ <td>69.3</td>
881
+ <td>60.6</td>
882
+ <td>96.8</td>
883
+ <td>69.9 ± 1.1</td>
884
+ </tr>
885
+ <tr>
886
+ <td>Gemini Flash 2<br>(No Anchor)</td>
887
+ <td>32.1</td>
888
+ <td>56.3</td>
889
+ <td>61.4</td>
890
+ <td>27.8</td>
891
+ <td>48.0</td>
892
+ <td>58.7</td>
893
+ <td><strong>84.4</strong></td>
894
+ <td>94.0</td>
895
+ <td>57.8 ± 1.1</td>
896
+ </tr>
897
+ <tr>
898
+ <td>Gemini Flash 2<br>(Anchored)</td>
899
+ <td>54.5</td>
900
+ <td>56.1</td>
901
+ <td>72.1</td>
902
+ <td>34.2</td>
903
+ <td>64.7</td>
904
+ <td>61.5</td>
905
+ <td>71.5</td>
906
+ <td>95.6</td>
907
+ <td>63.8 ± 1.2</td>
908
+ </tr>
909
+ <tr>
910
+ <td>Qwen 2 VL<br>(No Anchor)</td>
911
+ <td>19.7</td>
912
+ <td>31.7</td>
913
+ <td>24.2</td>
914
+ <td>17.1</td>
915
+ <td>88.9</td>
916
+ <td>8.3</td>
917
+ <td>6.8</td>
918
+ <td>55.5</td>
919
+ <td>31.5 ± 0.9</td>
920
+ </tr>
921
+ <tr>
922
+ <td>Qwen 2.5 VL<br>(No Anchor)</td>
923
+ <td>63.1</td>
924
+ <td>65.7</td>
925
+ <td>67.3</td>
926
+ <td>38.6</td>
927
+ <td>73.6</td>
928
+ <td>68.3</td>
929
+ <td>49.1</td>
930
+ <td>98.3</td>
931
+ <td>65.5 ± 1.2</td>
932
+ </tr>
933
+ <tr>
934
+ <td>olmOCR v0.1.75<br>(No Anchor)</td>
935
+ <td>71.5</td>
936
+ <td>71.4</td>
937
+ <td>71.4</td>
938
+ <td><strong>42.8</strong></td>
939
+ <td>94.1</td>
940
+ <td>77.7</td>
941
+ <td>71.0</td>
942
+ <td>97.8</td>
943
+ <td>74.7 ± 1.1</td>
944
+ </tr>
945
+ <tr>
946
+ <td>olmOCR v0.1.75<br>(Anchored)</td>
947
+ <td>74.9</td>
948
+ <td>71.2</td>
949
+ <td>71.0</td>
950
+ <td>42.2</td>
951
+ <td>94.5</td>
952
+ <td>78.3</td>
953
+ <td>73.3</td>
954
+ <td>98.3</td>
955
+ <td>75.5 ± 1.0</td>
956
+ </tr>
957
+ <tr>
958
+ <td>MonkeyOCR-pro-3B</td>
959
+ <td><strong>83.8</strong></td>
960
+ <td>68.8</td>
961
+ <td>74.6</td>
962
+ <td>36.1</td>
963
+ <td>91.2</td>
964
+ <td>76.6</td>
965
+ <td>80.1</td>
966
+ <td>95.3</td>
967
+ <td>75.8 ± 1.0</td>
968
+ </tr>
969
+ <tr>
970
+ <td><strong>dots.ocr</strong></td>
971
+ <td>82.1</td>
972
+ <td>64.2</td>
973
+ <td><strong>88.3</strong></td>
974
+ <td>40.9</td>
975
+ <td>94.1</td>
976
+ <td><strong>82.4</strong></td>
977
+ <td>81.2</td>
978
+ <td><strong>99.5</strong></td>
979
+ <td><strong>79.1 ± 1.0</strong></td>
980
+ </tr>
981
+ </tbody>
982
+ </table>
983
+
984
+
985
+ > **Note:**
986
+ > - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR),
987
+ [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
988
+ > - We delete the Page-header and Page-footer cells in the result markdown.
989
+
990
+
991
+
992
+ # Quick Start
993
+ ## 1. Installation
994
+ ### Install dots.ocr
995
+ ```shell
996
+ conda create -n dots_ocr python=3.12
997
+ conda activate dots_ocr
998
+
999
+ git clone https://github.com/rednote-hilab/dots.ocr.git
1000
+ cd dots.ocr
1001
+
1002
+ # Install pytorch, see https://pytorch.org/get-started/previous-versions/ for your cuda version
1003
+ pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
1004
+ pip install -e .
1005
+ ```
1006
+
1007
+ If you have trouble with the installation, try our [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) for an easier setup, and follow these steps:
1008
+ ```shell
1009
+ git clone https://github.com/rednote-hilab/dots.ocr.git
1010
+ cd dots.ocr
1011
+ pip install -e .
1012
+ ```
1013
+
1014
+
1015
+ ### Download Model Weights
1016
+ > 💡**Note:** Please use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`) for the model save path. This is a temporary workaround pending our integration with Transformers.
1017
+ ```shell
1018
+ python3 tools/download_model.py
1019
+
1020
+ # with modelscope
1021
+ python3 tools/download_model.py --type modelscope
1022
+ ```
1023
+
1024
+
1025
+ ## 2. Deployment
1026
+ ### vLLM inference
1027
+ We highly recommend using vLLM for deployment and inference. All of our evaluation results are based on vLLM version 0.9.1.
1028
+ The [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) is based on the official vllm image. You can also follow [Dockerfile](https://github.com/rednote-hilab/dots.ocr/blob/master/docker/Dockerfile) to build the deployment environment by yourself.
1029
+
1030
+ ```shell
1031
+ # You need to register the model with vLLM first
1032
+ python3 tools/download_model.py
1033
+ export hf_model_path=./weights/DotsOCR # Path to your downloaded model weights. Please use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`) for the model save path; this is a temporary workaround pending our integration with Transformers.
1034
+ export PYTHONPATH=$(dirname "$hf_model_path"):$PYTHONPATH
1035
+ sed -i '/^from vllm\.entrypoints\.cli\.main import main$/a\
1036
+ from DotsOCR import modeling_dots_ocr_vllm' `which vllm` # If you downloaded the model weights yourself, replace `DotsOCR` with the name of the directory where you saved the model, and remember to use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`)
1037
+
1038
+ # launch vllm server
1039
+ CUDA_VISIBLE_DEVICES=0 vllm serve ${hf_model_path} --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --chat-template-content-format string --served-model-name model --trust-remote-code
1040
+
1041
+ # If you get a ModuleNotFoundError: No module named 'DotsOCR', please check the note above on the saved model directory name.
1042
+
1043
+ # vllm api demo
1044
+ python3 ./demo/demo_vllm.py --prompt_mode prompt_layout_all_en
1045
+ ```
1046
+
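+ The server exposes an OpenAI-compatible API, so you can also query it directly, similar in spirit to `demo/demo_vllm.py`. Below is a minimal sketch; it assumes the `openai` Python package is installed, the server runs at the default `http://localhost:8000`, and the served model name is `model` (as set by `--served-model-name` above). The prompt text is abbreviated — use a full prompt from `dots_ocr/utils/prompts.py`.
+ 
+ ```python
+ import base64
+ from openai import OpenAI
+ 
+ # vLLM serves an OpenAI-compatible endpoint; the API key is unused but required by the client.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ 
+ # Encode the demo image as a base64 data URL.
+ with open("demo/demo_image1.jpg", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode()
+ 
+ # Use a task prompt, e.g. the prompt_layout_all_en text from dots_ocr/utils/prompts.py.
+ prompt = "Please output the layout information from the PDF image..."  # abbreviated
+ 
+ response = client.chat.completions.create(
+     model="model",  # matches --served-model-name above
+     messages=[{
+         "role": "user",
+         "content": [
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
+             {"type": "text", "text": prompt},
+         ],
+     }],
+     max_tokens=8192,
+ )
+ print(response.choices[0].message.content)
+ ```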
1047
+ ### Hugging Face inference
1048
+ ```shell
1049
+ python3 demo/demo_hf.py
1050
+ ```
1051
+
1052
+ <details>
1053
+ <summary><b>Hugging Face inference details</b></summary>
1054
+
1055
+ ```python
1056
+ import torch
1057
+ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
1058
+ from qwen_vl_utils import process_vision_info
1059
+ from dots_ocr.utils import dict_promptmode_to_prompt
1060
+
1061
+ model_path = "./weights/DotsOCR"
1062
+ model = AutoModelForCausalLM.from_pretrained(
1063
+ model_path,
1064
+ attn_implementation="flash_attention_2",
1065
+ torch_dtype=torch.bfloat16,
1066
+ device_map="auto",
1067
+ trust_remote_code=True
1068
+ )
1069
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
1070
+
1071
+ image_path = "demo/demo_image1.jpg"
1072
+ prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1073
+
1074
+ 1. Bbox format: [x1, y1, x2, y2]
1075
+
1076
+ 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
1077
+
1078
+ 3. Text Extraction & Formatting Rules:
1079
+ - Picture: For the 'Picture' category, the text field should be omitted.
1080
+ - Formula: Format its text as LaTeX.
1081
+ - Table: Format its text as HTML.
1082
+ - All Others (Text, Title, etc.): Format their text as Markdown.
1083
+
1084
+ 4. Constraints:
1085
+ - The output text must be the original text from the image, with no translation.
1086
+ - All layout elements must be sorted according to human reading order.
1087
+
1088
+ 5. Final Output: The entire output must be a single JSON object.
1089
+ """
1090
+
1091
+ messages = [
1092
+ {
1093
+ "role": "user",
1094
+ "content": [
1095
+ {
1096
+ "type": "image",
1097
+ "image": image_path
1098
+ },
1099
+ {"type": "text", "text": prompt}
1100
+ ]
1101
+ }
1102
+ ]
1103
+
1104
+ # Preparation for inference
1105
+ text = processor.apply_chat_template(
1106
+ messages,
1107
+ tokenize=False,
1108
+ add_generation_prompt=True
1109
+ )
1110
+ image_inputs, video_inputs = process_vision_info(messages)
1111
+ inputs = processor(
1112
+ text=[text],
1113
+ images=image_inputs,
1114
+ videos=video_inputs,
1115
+ padding=True,
1116
+ return_tensors="pt",
1117
+ )
1118
+
1119
+ inputs = inputs.to("cuda")
1120
+
1121
+ # Inference: Generation of the output
1122
+ generated_ids = model.generate(**inputs, max_new_tokens=24000)
1123
+ generated_ids_trimmed = [
1124
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
1125
+ ]
1126
+ output_text = processor.batch_decode(
1127
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
1128
+ )
1129
+ print(output_text)
1130
+
1131
+ ```
1132
+
1133
+ </details>
1134
+
1135
+ ### Hugging Face inference with CPU
1136
+ Please refer to [CPU inference](https://github.com/rednote-hilab/dots.ocr/issues/1#issuecomment-3148962536)
1137
+
1138
+
1139
+ ## 3. Document Parse
1140
+ **Based on vLLM server**, you can parse an image or a pdf file using the following commands:
1141
+ ```bash
1142
+
1143
+ # Parse all layout info, both detection and recognition
1144
+ # Parse a single image
1145
+ python3 dots_ocr/parser.py demo/demo_image1.jpg
1146
+ # Parse a single PDF
1147
+ python3 dots_ocr/parser.py demo/demo_pdf1.pdf --num_thread 64 # try a larger num_thread for PDFs with many pages
1148
+
1149
+ # Layout detection only
1150
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_layout_only_en
1151
+
1152
+ # Parse text only, except Page-header and Page-footer
1153
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_ocr
1154
+
1155
+ # Parse layout info by bbox
1156
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_grounding_ocr --bbox 163 241 1536 705
1157
+
1158
+ ```
1159
+ **Based on Transformers**, you can parse an image or a PDF file with the same commands as above; just add `--use_hf true`.
1160
+
1161
+ > Notice: Transformers is slower than vLLM. If you want to use demo/* with Transformers, just pass `use_hf=True` to the parser, i.e. `DotsOCRParser(..., use_hf=True)` — see the sketch below.
1162
+
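+ As a minimal sketch of the programmatic route (the method name `parse_file` and its arguments are illustrative assumptions — check `dots_ocr/parser.py` for the actual API):
+ 
+ ```python
+ from dots_ocr.parser import DotsOCRParser
+ 
+ # use_hf=True switches from the vLLM server to local Transformers inference.
+ parser = DotsOCRParser(use_hf=True)  # other constructor arguments omitted
+ 
+ # Hypothetical call; verify the real method name and signature in dots_ocr/parser.py.
+ results = parser.parse_file("demo/demo_image1.jpg", prompt_mode="prompt_layout_all_en")
+ print(results)
+ ```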
1163
+ <details>
1164
+ <summary><b>Output Results</b></summary>
1165
+
1166
+ 1. **Structured Layout Data** (`demo_image1.json`): A JSON file containing the detected layout elements, including their bounding boxes, categories, and extracted text.
1167
+ 2. **Processed Markdown File** (`demo_image1.md`): A Markdown file generated from the concatenated text of all detected cells.
1168
+ * An additional version, `demo_image1_nohf.md`, is also provided, which excludes page headers and footers for compatibility with benchmarks like OmniDocBench and olmOCR-bench.
1169
+ 3. **Layout Visualization** (`demo_image1.jpg`): The original image with the detected layout bounding boxes drawn on it.
1170
+
1171
+ </details>
1172
+
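+ As a minimal sketch of consuming the structured layout data (the output path and the `bbox`/`category`/`text` field names follow the layout format described above, but are assumptions to verify against your actual output):
+ 
+ ```python
+ import json
+ 
+ with open("output/demo_image1.json") as f:
+     cells = json.load(f)
+ 
+ # Each cell is one layout element; 'Picture' cells omit the text field.
+ for cell in cells:
+     x1, y1, x2, y2 = cell["bbox"]
+     snippet = cell.get("text", "")[:60]
+     print(f"{cell['category']} at ({x1}, {y1}, {x2}, {y2}): {snippet}")
+ ```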
1173
+ ## 4. Demo
1174
+ You can run the demo with the following command, or try it directly at the [live demo](https://dotsocr.xiaohongshu.com/):
1175
+ ```bash
1176
+ python demo/demo_gradio.py
1177
+ ```
1178
+
1179
+ We also provide a demo for grounding OCR:
1180
+ ```bash
1181
+ python demo/demo_gradio_annotion.py
1182
+ ```
1183
+
1184
+
1185
+ ### Example for formula document
1186
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula1.png" alt="formula1.png" border="0" />
1187
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula2.png" alt="formula2.png" border="0" />
1188
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula3.png" alt="formula3.png" border="0" />
1189
+
1190
+ ### Example for table document
1191
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table1.png" alt="table1.png" border="0" />
1192
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table2.png" alt="table2.png" border="0" />
1193
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table3.png" alt="table3.png" border="0" />
1194
+
1195
+ ### Example for multilingual document
1196
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/Tibetan.png" alt="Tibetan.png" border="0" />
1197
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/tradition_zh.png" alt="tradition_zh.png" border="0" />
1198
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/nl.png" alt="nl.png" border="0" />
1199
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/kannada.png" alt="kannada.png" border="0" />
1200
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/russian.png" alt="russian.png" border="0" />
1201
+
1202
+ ### Example for reading order
1203
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/reading_order.png" alt="reading_order.png" border="0" />
1204
+
1205
+ ### Example for grounding ocr
1206
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/grounding.png" alt="grounding.png" border="0" />
1207
+
1208
+
1209
+ ## Acknowledgments
1210
+ We would like to thank [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL), [aimv2](https://github.com/apple/ml-aim), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR),
1211
+ [OmniDocBench](https://github.com/opendatalab/OmniDocBench), [PyMuPDF](https://github.com/pymupdf/PyMuPDF), for providing code and models.
1212
+
1213
+ We also thank [DocLayNet](https://github.com/DS4SD/DocLayNet), [M6Doc](https://github.com/HCIILAB/M6Doc), [CDLA](https://github.com/buptlihang/CDLA), [D4LA](https://github.com/AlibabaResearch/AdvancedLiterateMachinery) for providing valuable datasets.
1214
+
1215
+ ## Limitation & Future Work
1216
+
1217
+ - **Complex Document Elements:**
1218
+ - **Table & Formula**: dots.ocr is not yet perfect at extracting high-complexity tables and formulas.
1219
+ - **Picture**: Pictures in documents are currently not parsed.
1220
+
1221
+ - **Parsing Failures:** The model may fail to parse under certain conditions:
1222
+ - When the character-to-pixel ratio is excessively high: try enlarging the image or increasing the PDF parsing DPI (a setting of 200 is recommended; see the sketch after this list). However, please note that the model performs optimally on images with a resolution under 11,289,600 pixels.
1223
+ - Continuous special characters, such as ellipses (`...`) and underscores (`_`), may cause the prediction output to repeat endlessly. In such scenarios, consider using alternative prompts like `prompt_layout_only_en`, `prompt_ocr`, or `prompt_grounding_ocr` ([details here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)).
1224
+
1225
+ - **Performance Bottleneck:** Despite its 1.7B parameter LLM foundation, **dots.ocr** is not yet optimized for high-throughput processing of large PDF volumes.
1226
+
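+ For the DPI workaround mentioned in the list above, here is a minimal sketch using [PyMuPDF](https://github.com/pymupdf/PyMuPDF) to render PDF pages at 200 DPI (the output filenames are illustrative):
+ 
+ ```python
+ import fitz  # PyMuPDF
+ 
+ doc = fitz.open("demo/demo_pdf1.pdf")
+ for i, page in enumerate(doc):
+     # Render at 200 DPI, the recommended setting for this model.
+     pix = page.get_pixmap(dpi=200)
+     pix.save(f"page_{i}.png")
+ ```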
1227
+ We are committed to achieving more accurate table and formula parsing, as well as enhancing the model's OCR capabilities for broader generalization, all while aiming for **a more powerful, more efficient model**. Furthermore, we are actively considering the development of **a more general-purpose perception model** based on Vision-Language Models (VLMs), which would integrate general detection, image captioning, and OCR tasks into a unified framework. **Parsing the content of pictures in documents** is also a key priority for our future work.
1228
+ We believe that collaboration is the key to tackling these exciting challenges. If you are passionate about advancing the frontiers of document intelligence and are interested in contributing to these future endeavors, we would love to hear from you. Please reach out to us via email at yanqing4@xiaohongshu.com.
assets/blog.md ADDED
@@ -0,0 +1,1044 @@
1
+ <h1 align="center">
2
+ dots.ocr: Multilingual Document Layout Parsing in a Single Vision-Language Model
3
+ </h1>
4
+
5
+
6
+ ## Introduction
7
+
8
+ **dots.ocr** is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance.
9
+
10
+ 1. **Powerful Performance:** **dots.ocr** achieves SOTA performance for text, tables, and reading order on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), while delivering formula recognition results comparable to much larger models such as Doubao-1.5 and Gemini-2.5-Pro.
11
+ 2. **Multilingual Support:** **dots.ocr** demonstrates robust parsing capabilities for low-resource languages, achieving decisive advantages across both layout detection and content recognition on our in-house multilingual documents benchmark.
12
+ 3. **Unified and Simple Architecture:** By leveraging a single vision-language model, **dots.ocr** offers a significantly more streamlined architecture than conventional methods that rely on complex, multi-model pipelines. Switching between tasks is accomplished simply by altering the input prompt, proving that a VLM can achieve competitive detection results compared to traditional detection models like DocLayout-YOLO.
13
+ 4. **Efficient and Fast Performance:** Built upon a compact 1.7B LLM, **dots.ocr** provides faster inference speeds than many other high-performing models based on larger foundations.
14
+
15
+
16
+ ### Performance Comparison on Document Parsing Benchmarks
17
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/chart.png" border="0" />
18
+
19
+ > **Notes:**
20
+ > - The EN and ZH metrics are the end-to-end evaluation results on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and the Multilingual metric is the end-to-end evaluation result on dots.ocr-bench.
21
+
22
+
23
+ ## Showcase
24
+ ### Example for formula document
25
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula1.png" alt="formula1.png" border="0" />
26
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula2.png" alt="formula2.png" border="0" />
27
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula3.png" alt="formula3.png" border="0" />
28
+
29
+ ### Example for table document
30
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table1.png" alt="table1.png" border="0" />
31
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table2.png" alt="table2.png" border="0" />
32
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table3.png" alt="table3.png" border="0" />
33
+
34
+ ### Example for multilingual document
35
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/Tibetan.png" alt="Tibetan.png" border="0" />
36
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/tradition_zh.png" alt="tradition_zh.png" border="0" />
37
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/nl.png" alt="nl.png" border="0" />
38
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/kannada.png" alt="kannada.png" border="0" />
39
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/russian.png" alt="russian.png" border="0" />
40
+
41
+ ### Example for reading order
42
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/reading_order.png" alt="reading_order.png" border="0" />
43
+
44
+ ### Example for grounding ocr
45
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/grounding.png" alt="grounding.png" border="0" />
46
+
47
+
48
+
49
## Benchmark Results

### 1. OmniDocBench

#### The end-to-end evaluation results of different tasks.

<table>
  <thead>
    <tr>
      <th rowspan="2"><strong>Model<br>Type</strong></th>
      <th rowspan="2"><strong>Methods</strong></th>
      <th colspan="2"><strong>Overall<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Text<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Formula<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Table<sup>TEDS</sup>↑</strong></th>
      <th colspan="2"><strong>Table<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Read Order<sup>Edit</sup>↓</strong></th>
    </tr>
    <tr>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
    </tr>
  </thead>
  <tbody>
    <tr><td rowspan="8"><strong>Pipeline<br>Tools</strong></td><td>MinerU</td><td>0.150</td><td>0.357</td><td>0.061</td><td>0.215</td><td>0.278</td><td>0.577</td><td>78.6</td><td>62.1</td><td>0.180</td><td>0.344</td><td>0.079</td><td>0.292</td></tr>
    <tr><td>Marker</td><td>0.336</td><td>0.556</td><td>0.080</td><td>0.315</td><td>0.530</td><td>0.883</td><td>67.6</td><td>49.2</td><td>0.619</td><td>0.685</td><td>0.114</td><td>0.340</td></tr>
    <tr><td>Mathpix</td><td>0.191</td><td>0.365</td><td>0.105</td><td>0.384</td><td>0.306</td><td>0.454</td><td>77.0</td><td>67.1</td><td>0.243</td><td>0.320</td><td>0.108</td><td>0.304</td></tr>
    <tr><td>Docling</td><td>0.589</td><td>0.909</td><td>0.416</td><td>0.987</td><td>0.999</td><td>1</td><td>61.3</td><td>25.0</td><td>0.627</td><td>0.810</td><td>0.313</td><td>0.837</td></tr>
    <tr><td>Pix2Text</td><td>0.320</td><td>0.528</td><td>0.138</td><td>0.356</td><td>0.276</td><td>0.611</td><td>73.6</td><td>66.2</td><td>0.584</td><td>0.645</td><td>0.281</td><td>0.499</td></tr>
    <tr><td>Unstructured</td><td>0.586</td><td>0.716</td><td>0.198</td><td>0.481</td><td>0.999</td><td>1</td><td>0</td><td>0.06</td><td>1</td><td>0.998</td><td>0.145</td><td>0.387</td></tr>
    <tr><td>OpenParse</td><td>0.646</td><td>0.814</td><td>0.681</td><td>0.974</td><td>0.996</td><td>1</td><td>64.8</td><td>27.5</td><td>0.284</td><td>0.639</td><td>0.595</td><td>0.641</td></tr>
    <tr><td>PPStruct-V3</td><td>0.145</td><td>0.206</td><td>0.058</td><td>0.088</td><td>0.295</td><td>0.535</td><td>-</td><td>-</td><td>0.159</td><td>0.109</td><td>0.069</td><td>0.091</td></tr>
    <tr><td rowspan="9"><strong>Expert<br>VLMs</strong></td><td>GOT-OCR</td><td>0.287</td><td>0.411</td><td>0.189</td><td>0.315</td><td>0.360</td><td>0.528</td><td>53.2</td><td>47.2</td><td>0.459</td><td>0.520</td><td>0.141</td><td>0.280</td></tr>
    <tr><td>Nougat</td><td>0.452</td><td>0.973</td><td>0.365</td><td>0.998</td><td>0.488</td><td>0.941</td><td>39.9</td><td>0</td><td>0.572</td><td>1.000</td><td>0.382</td><td>0.954</td></tr>
    <tr><td>Mistral OCR</td><td>0.268</td><td>0.439</td><td>0.072</td><td>0.325</td><td>0.318</td><td>0.495</td><td>75.8</td><td>63.6</td><td>0.600</td><td>0.650</td><td>0.083</td><td>0.284</td></tr>
    <tr><td>OLMOCR-sglang</td><td>0.326</td><td>0.469</td><td>0.097</td><td>0.293</td><td>0.455</td><td>0.655</td><td>68.1</td><td>61.3</td><td>0.608</td><td>0.652</td><td>0.145</td><td>0.277</td></tr>
    <tr><td>SmolDocling-256M</td><td>0.493</td><td>0.816</td><td>0.262</td><td>0.838</td><td>0.753</td><td>0.997</td><td>44.9</td><td>16.5</td><td>0.729</td><td>0.907</td><td>0.227</td><td>0.522</td></tr>
    <tr><td>Dolphin</td><td>0.206</td><td>0.306</td><td>0.107</td><td>0.197</td><td>0.447</td><td>0.580</td><td>77.3</td><td>67.2</td><td>0.180</td><td>0.285</td><td>0.091</td><td>0.162</td></tr>
    <tr><td>MinerU 2</td><td>0.139</td><td>0.240</td><td>0.047</td><td>0.109</td><td>0.297</td><td>0.536</td><td>82.5</td><td>79.0</td><td>0.141</td><td>0.195</td><td>0.069</td><td>0.118</td></tr>
    <tr><td>OCRFlux</td><td>0.195</td><td>0.281</td><td>0.064</td><td>0.183</td><td>0.379</td><td>0.613</td><td>71.6</td><td>81.3</td><td>0.253</td><td>0.139</td><td>0.086</td><td>0.187</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td>0.138</td><td>0.206</td><td>0.067</td><td>0.107</td><td><strong>0.246</strong></td><td>0.421</td><td>81.5</td><td>87.5</td><td>0.139</td><td>0.111</td><td>0.100</td><td>0.185</td></tr>
    <tr><td rowspan="5"><strong>General<br>VLMs</strong></td><td>GPT4o</td><td>0.233</td><td>0.399</td><td>0.144</td><td>0.409</td><td>0.425</td><td>0.606</td><td>72.0</td><td>62.9</td><td>0.234</td><td>0.329</td><td>0.128</td><td>0.251</td></tr>
    <tr><td>Qwen2-VL-72B</td><td>0.252</td><td>0.327</td><td>0.096</td><td>0.218</td><td>0.404</td><td>0.487</td><td>76.8</td><td>76.4</td><td>0.387</td><td>0.408</td><td>0.119</td><td>0.193</td></tr>
    <tr><td>Qwen2.5-VL-72B</td><td>0.214</td><td>0.261</td><td>0.092</td><td>0.18</td><td>0.315</td><td>0.434</td><td>82.9</td><td>83.9</td><td>0.341</td><td>0.262</td><td>0.106</td><td>0.168</td></tr>
    <tr><td>Gemini2.5-Pro</td><td>0.148</td><td>0.212</td><td>0.055</td><td>0.168</td><td>0.356</td><td>0.439</td><td>85.8</td><td>86.4</td><td>0.13</td><td>0.119</td><td>0.049</td><td>0.121</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.140</td><td>0.162</td><td>0.043</td><td>0.085</td><td>0.295</td><td><strong>0.384</strong></td><td>83.3</td><td><strong>89.3</strong></td><td>0.165</td><td><strong>0.085</strong></td><td>0.058</td><td>0.094</td></tr>
    <tr><td rowspan="1"><strong>Expert VLMs</strong></td><td><strong>dots.ocr</strong></td><td><strong>0.125</strong></td><td><strong>0.160</strong></td><td><strong>0.032</strong></td><td><strong>0.066</strong></td><td>0.329</td><td>0.416</td><td><strong>88.6</strong></td><td>89.0</td><td><strong>0.099</strong></td><td>0.092</td><td><strong>0.040</strong></td><td><strong>0.067</strong></td></tr>
  </tbody>
</table>


#### The end-to-end text recognition performance across 9 PDF page types.

<table>
  <thead>
    <tr>
      <th><strong>Model<br>Type</strong></th>
      <th><strong>Models</strong></th>
      <th><strong>Book</strong></th>
      <th><strong>Slides</strong></th>
      <th><strong>Financial<br>Report</strong></th>
      <th><strong>Textbook</strong></th>
      <th><strong>Exam<br>Paper</strong></th>
      <th><strong>Magazine</strong></th>
      <th><strong>Academic<br>Papers</strong></th>
      <th><strong>Notes</strong></th>
      <th><strong>Newspaper</strong></th>
      <th><strong>Overall</strong></th>
    </tr>
  </thead>
  <tbody>
    <tr><td rowspan="3"><strong>Pipeline<br>Tools</strong></td><td>MinerU</td><td>0.055</td><td>0.124</td><td><u>0.033</u></td><td>0.102</td><td>0.159</td><td><strong>0.072</strong></td><td><u>0.025</u></td><td>0.984</td><td>0.171</td><td>0.206</td></tr>
    <tr><td>Marker</td><td>0.074</td><td>0.340</td><td>0.089</td><td>0.319</td><td>0.452</td><td>0.153</td><td>0.059</td><td>0.651</td><td>0.192</td><td>0.274</td></tr>
    <tr><td>Mathpix</td><td>0.131</td><td>0.220</td><td>0.202</td><td>0.216</td><td>0.278</td><td>0.147</td><td>0.091</td><td>0.634</td><td>0.690</td><td>0.300</td></tr>
    <tr><td rowspan="5"><strong>Expert<br>VLMs</strong></td><td>GOT-OCR</td><td>0.111</td><td>0.222</td><td>0.067</td><td>0.132</td><td>0.204</td><td>0.198</td><td>0.179</td><td>0.388</td><td>0.771</td><td>0.267</td></tr>
    <tr><td>Nougat</td><td>0.734</td><td>0.958</td><td>1.000</td><td>0.820</td><td>0.930</td><td>0.830</td><td>0.214</td><td>0.991</td><td>0.871</td><td>0.806</td></tr>
    <tr><td>Dolphin</td><td>0.091</td><td>0.131</td><td>0.057</td><td>0.146</td><td>0.231</td><td>0.121</td><td>0.074</td><td>0.363</td><td>0.307</td><td>0.177</td></tr>
    <tr><td>OCRFlux</td><td>0.068</td><td>0.125</td><td>0.092</td><td>0.102</td><td>0.119</td><td>0.083</td><td>0.047</td><td>0.223</td><td>0.536</td><td>0.149</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td>0.084</td><td>0.129</td><td>0.060</td><td>0.090</td><td>0.107</td><td>0.073</td><td>0.050</td><td>0.171</td><td>0.107</td><td>0.100</td></tr>
    <tr><td rowspan="4"><strong>General<br>VLMs</strong></td><td>GPT4o</td><td>0.157</td><td>0.163</td><td>0.348</td><td>0.187</td><td>0.281</td><td>0.173</td><td>0.146</td><td>0.607</td><td>0.751</td><td>0.316</td></tr>
    <tr><td>Qwen2.5-VL-7B</td><td>0.148</td><td>0.053</td><td>0.111</td><td>0.137</td><td>0.189</td><td>0.117</td><td>0.134</td><td>0.204</td><td>0.706</td><td>0.205</td></tr>
    <tr><td>InternVL3-8B</td><td>0.163</td><td>0.056</td><td>0.107</td><td>0.109</td><td>0.129</td><td>0.100</td><td>0.159</td><td>0.150</td><td>0.681</td><td>0.188</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.048</td><td>0.048</td><td>0.024</td><td><strong>0.062</strong></td><td>0.085</td><td>0.051</td><td>0.039</td><td><strong>0.096</strong></td><td>0.181</td><td>0.073</td></tr>
    <tr><td rowspan="1"><strong>Expert VLMs</strong></td><td><strong>dots.ocr</strong></td><td><strong>0.031</strong></td><td><strong>0.047</strong></td><td><strong>0.011</strong></td><td>0.082</td><td><strong>0.079</strong></td><td><strong>0.028</strong></td><td><strong>0.029</strong></td><td>0.109</td><td><strong>0.056</strong></td><td><strong>0.055</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
> - We remove the Page-header and Page-footer cells from the result markdown.
> - We use the tikz_preprocess pipeline to upsample the images to 200 DPI.


### 2. **dots.ocr-bench**

This is an in-house benchmark containing 1,493 PDF images across 100 languages.

#### The end-to-end evaluation results of different tasks.

<table>
  <thead>
    <tr>
      <th><strong>Methods</strong></th>
      <th><strong>Overall<sup>Edit</sup>↓</strong></th>
      <th><strong>Text<sup>Edit</sup>↓</strong></th>
      <th><strong>Formula<sup>Edit</sup>↓</strong></th>
      <th><strong>Table<sup>TEDS</sup>↑</strong></th>
      <th><strong>Table<sup>Edit</sup>↓</strong></th>
      <th><strong>Read Order<sup>Edit</sup>↓</strong></th>
    </tr>
  </thead>
  <tbody>
    <tr><td>MonkeyOCR-3B</td><td>0.483</td><td>0.445</td><td>0.627</td><td>50.93</td><td>0.452</td><td>0.409</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.291</td><td>0.226</td><td>0.440</td><td>71.2</td><td>0.260</td><td>0.238</td></tr>
    <tr><td>doubao-1-6</td><td>0.299</td><td>0.270</td><td>0.417</td><td>71.0</td><td>0.258</td><td>0.253</td></tr>
    <tr><td>Gemini2.5-Pro</td><td>0.251</td><td>0.163</td><td>0.402</td><td>77.1</td><td>0.236</td><td>0.202</td></tr>
    <tr><td><strong>dots.ocr</strong></td><td><strong>0.177</strong></td><td><strong>0.075</strong></td><td><strong>0.297</strong></td><td><strong>79.2</strong></td><td><strong>0.186</strong></td><td><strong>0.152</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - We use the same metric calculation pipeline as [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
> - We remove the Page-header and Page-footer cells from the result markdown.

#### Layout Detection

<table>
  <thead>
    <tr>
      <th rowspan="2"><strong>Method</strong></th>
      <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50:.05:.95↑</strong></th>
      <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50↑</strong></th>
    </tr>
    <tr>
      <th>Overall</th><th>Text</th><th>Formula</th><th>Table</th><th>Picture</th>
      <th>Overall</th><th>Text</th><th>Formula</th><th>Table</th><th>Picture</th>
    </tr>
  </thead>
  <tbody>
    <tr><td>DocLayout-YOLO-DocStructBench</td><td>0.733</td><td>0.694</td><td>0.480</td><td>0.803</td><td>0.619</td><td>0.806</td><td>0.779</td><td>0.620</td><td>0.858</td><td>0.678</td></tr>
    <tr><td>dots.ocr-parse all</td><td>0.831</td><td>0.801</td><td>0.654</td><td>0.838</td><td>0.748</td><td>0.922</td><td>0.909</td><td>0.770</td><td>0.888</td><td>0.831</td></tr>
    <tr><td><strong>dots.ocr-detection only</strong></td><td><strong>0.845</strong></td><td><strong>0.816</strong></td><td><strong>0.716</strong></td><td><strong>0.875</strong></td><td><strong>0.765</strong></td><td><strong>0.930</strong></td><td><strong>0.917</strong></td><td><strong>0.832</strong></td><td><strong>0.918</strong></td><td><strong>0.843</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - We use prompt_layout_all_en for **parse all** and prompt_layout_only_en for **detection only**; please refer to the [prompts](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py). A usage sketch follows below.

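The snippet below is a minimal sketch of switching between the two modes, assuming a local vLLM deployment of the model and using the `DotsOCRParser.parse_image` API that the Gradio demo in this repository also calls; the server address and image path are placeholders:

```python
from PIL import Image
from dots_ocr.parser import DotsOCRParser

# Placeholder server address/port; point these at your own vLLM deployment.
parser = DotsOCRParser(ip="127.0.0.1", port=8000, dpi=200)
img = Image.open("demo/demo_image1.jpg")

# "parse all": layout detection plus content recognition in a single pass.
parser.parse_image(input_path=img, filename="demo", prompt_mode="prompt_layout_all_en", save_dir="./output")

# "detection only": layout boxes and categories, without text content.
parser.parse_image(input_path=img, filename="demo", prompt_mode="prompt_layout_only_en", save_dir="./output")
```
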
### 3. olmOCR-bench

<table>
  <thead>
    <tr>
      <th>Model</th>
      <th>ArXiv</th>
      <th>Old Scans<br>Math</th>
      <th>Tables</th>
      <th>Old Scans</th>
      <th>Headers and<br>Footers</th>
      <th>Multi<br>column</th>
      <th>Long Tiny<br>Text</th>
      <th>Base</th>
      <th>Overall</th>
    </tr>
  </thead>
  <tbody>
    <tr><td>GOT OCR</td><td>52.7</td><td>52.0</td><td>0.2</td><td>22.1</td><td>93.6</td><td>42.0</td><td>29.9</td><td>94.0</td><td>48.3 ± 1.1</td></tr>
    <tr><td>Marker</td><td>76.0</td><td>57.9</td><td>57.6</td><td>27.8</td><td>84.9</td><td>72.9</td><td>84.6</td><td>99.1</td><td>70.1 ± 1.1</td></tr>
    <tr><td>MinerU</td><td>75.4</td><td>47.4</td><td>60.9</td><td>17.3</td><td><strong>96.6</strong></td><td>59.0</td><td>39.1</td><td>96.6</td><td>61.5 ± 1.1</td></tr>
    <tr><td>Mistral OCR</td><td>77.2</td><td>67.5</td><td>60.6</td><td>29.3</td><td>93.6</td><td>71.3</td><td>77.1</td><td>99.4</td><td>72.0 ± 1.1</td></tr>
    <tr><td>Nanonets OCR</td><td>67.0</td><td>68.6</td><td>77.7</td><td>39.5</td><td>40.7</td><td>69.9</td><td>53.4</td><td>99.3</td><td>64.5 ± 1.1</td></tr>
    <tr><td>GPT-4o<br>(No Anchor)</td><td>51.5</td><td><strong>75.5</strong></td><td>69.1</td><td>40.9</td><td>94.2</td><td>68.9</td><td>54.1</td><td>96.7</td><td>68.9 ± 1.1</td></tr>
    <tr><td>GPT-4o<br>(Anchored)</td><td>53.5</td><td>74.5</td><td>70.0</td><td>40.7</td><td>93.8</td><td>69.3</td><td>60.6</td><td>96.8</td><td>69.9 ± 1.1</td></tr>
    <tr><td>Gemini Flash 2<br>(No Anchor)</td><td>32.1</td><td>56.3</td><td>61.4</td><td>27.8</td><td>48.0</td><td>58.7</td><td><strong>84.4</strong></td><td>94.0</td><td>57.8 ± 1.1</td></tr>
    <tr><td>Gemini Flash 2<br>(Anchored)</td><td>54.5</td><td>56.1</td><td>72.1</td><td>34.2</td><td>64.7</td><td>61.5</td><td>71.5</td><td>95.6</td><td>63.8 ± 1.2</td></tr>
    <tr><td>Qwen 2 VL<br>(No Anchor)</td><td>19.7</td><td>31.7</td><td>24.2</td><td>17.1</td><td>88.9</td><td>8.3</td><td>6.8</td><td>55.5</td><td>31.5 ± 0.9</td></tr>
    <tr><td>Qwen 2.5 VL<br>(No Anchor)</td><td>63.1</td><td>65.7</td><td>67.3</td><td>38.6</td><td>73.6</td><td>68.3</td><td>49.1</td><td>98.3</td><td>65.5 ± 1.2</td></tr>
    <tr><td>olmOCR v0.1.75<br>(No Anchor)</td><td>71.5</td><td>71.4</td><td>71.4</td><td><strong>42.8</strong></td><td>94.1</td><td>77.7</td><td>71.0</td><td>97.8</td><td>74.7 ± 1.1</td></tr>
    <tr><td>olmOCR v0.1.75<br>(Anchored)</td><td>74.9</td><td>71.2</td><td>71.0</td><td>42.2</td><td>94.5</td><td>78.3</td><td>73.3</td><td>98.3</td><td>75.5 ± 1.0</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td><strong>83.8</strong></td><td>68.8</td><td>74.6</td><td>36.1</td><td>91.2</td><td>76.6</td><td>80.1</td><td>95.3</td><td>75.8 ± 1.0</td></tr>
    <tr><td><strong>dots.ocr</strong></td><td>82.1</td><td>64.2</td><td><strong>88.3</strong></td><td>40.9</td><td>94.1</td><td><strong>82.4</strong></td><td>81.2</td><td><strong>99.5</strong></td><td><strong>79.1 ± 1.0</strong></td></tr>
  </tbody>
</table>


> **Note:**
> - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
> - We remove the Page-header and Page-footer cells from the result markdown.

## Methods

### Pretrain

We developed a foundational Vision-Language Model (VLM) through a three-stage training process:

* **Stage 1: Vision Encoder Pre-training**
  We trained a 1.2-billion-parameter Vision Encoder (VE) from scratch on a vast and comprehensive dataset of image-text pairs.
* **Stage 2: VE Continued Pre-training**
  We incorporated additional visual data, including OCR, video, and grounding data. Leveraging the `NaViT` architecture, our model supports high-resolution inputs of up to 11 million pixels (see the sketch after this list). The VE was then aligned with the `Qwen2.5-1.5B` language model and trained on this diverse visual data with the LLM frozen, which resulted in our general vision encoder `dots.vit`.
* **Stage 3: VLM Specialization for OCR**
  We then trained on a pure OCR dataset. To improve training efficiency, we first trained on a certain volume of tokens with the VE parameters frozen. Subsequently, we unfroze all parameters and continued training on an additional one-fifth of that token volume, which produced our foundational OCR model, `dots.ocr.base`.
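
As a hedged illustration of that pixel budget (a hypothetical helper, assuming only the `MIN_PIXELS`/`MAX_PIXELS` constants exported by `dots_ocr.utils.consts`), an input image can be rescaled into the supported range before inference:

```python
from PIL import Image
from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS

def fit_pixel_budget(img: Image.Image) -> Image.Image:
    """Hypothetical helper: rescale an image so its area falls inside [MIN_PIXELS, MAX_PIXELS]."""
    pixels = img.width * img.height
    if pixels > MAX_PIXELS:
        scale = (MAX_PIXELS / pixels) ** 0.5
    elif pixels < MIN_PIXELS:
        scale = (MIN_PIXELS / pixels) ** 0.5
    else:
        return img
    return img.resize((max(1, int(img.width * scale)), max(1, int(img.height * scale))))
```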

### SFT

The SFT stage was built on the following key strategies:

* **Diverse SFT Dataset:** We constructed a dataset of nearly 300,000 samples, integrating our in-house manual annotations, synthetic data (tables, formulas, multilingual OCR), as well as open-source datasets.
* **Iterative Data Flywheel:** We employed a feedback loop to build an in-house multilingual structured layout dataset of 15k samples. This process, repeated over three iterations, involved:
    * Sampling "bad cases" based on model performance.
    * Manually annotating these cases.
    * Adding them back into the training set.
* **Reading Order:** We corrected the sequence of all layout element boxes to establish the correct reading order. This was primarily done using larger models for sorting, supplemented by rule-based post-processing methods. We found that with sufficient data diversity and quality, training the model on a list of elements sorted in their natural reading order yields excellent results.
* **Quality and Robustness:** We built a multi-expert system for data cleaning and distillation, and applied data augmentation (resizing, rotation, noise) to improve model robustness.
* **Multitask Training:** We leveraged a single source of structured layout data to generate the SFT data with a variety of prompts. This enables the model to perform different tasks, such as detection and recognition, based on the specific prompt provided (see the sketch below).
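
A minimal sketch of inspecting the prompt modes that drive this multitask behaviour, assuming the repository's `dots_ocr` package is installed (the mapping is the same one the demos import):

```python
from dots_ocr.utils import dict_promptmode_to_prompt

# Each entry maps a task name (e.g. full layout parsing, detection-only, plain OCR)
# to the prompt text that selects that task at inference time.
for mode, prompt in dict_promptmode_to_prompt.items():
    print(f"{mode}: {prompt[:60]}...")
```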

The resulting `dots.ocr` model demonstrates performance on par with models possessing significantly more parameters.


## Limitations & Future Work

- **Complex Document Elements:**
  - **Table & Formula**: dots.ocr is not yet perfect at extracting highly complex tables and formulas.
  - **Picture**: Pictures in documents are currently not parsed.

- **Parsing Failures:** The model may fail to parse under certain conditions:
  - When the character-to-pixel ratio is excessively high. Try enlarging the image or increasing the PDF parsing DPI (a setting of 200 is recommended; see the sketch after this list). However, please note that the model performs optimally on images with a resolution under 11,289,600 pixels.
  - Continuous special characters, such as ellipses (`...`) and underscores (`_`), may cause the prediction output to repeat endlessly. In such scenarios, consider using alternative prompts like `prompt_layout_only_en`, `prompt_ocr`, or `prompt_grounding_ocr` ([details here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)).

- **Performance Bottleneck:** Despite its 1.7B-parameter LLM foundation, **dots.ocr** is not yet optimized for high-throughput processing of large PDF volumes.
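
As a concrete illustration of the DPI advice above (a sketch using PyMuPDF, which this repository already relies on for its fitz-based preprocessing; the file path is a placeholder):

```python
import fitz  # PyMuPDF

# Placeholder document; substitute your own PDF.
doc = fitz.open("demo/demo_pdf1.pdf")
page = doc[0]

# Rasterize at the recommended 200 DPI before sending the page to the model.
pix = page.get_pixmap(dpi=200)
pix.save("page0_200dpi.png")
```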

We are committed to achieving more accurate table and formula parsing, as well as enhancing the model's OCR capabilities for broader generalization, all while aiming for **a more powerful, more efficient model**. Furthermore, we are actively considering the development of **a more general-purpose perception model** based on Vision-Language Models (VLMs), which would integrate general detection, image captioning, and OCR tasks into a unified framework. **Parsing the content of the pictures in documents** is also a key priority for our future work.
We believe that collaboration is the key to tackling these exciting challenges. If you are passionate about advancing the frontiers of document intelligence and are interested in contributing to these future endeavors, we would love to hear from you. Please reach out to us via email at: [yanqing4@xiaohongshu.com].

## Author List

### Contributors
Mi Jian, Yumeng Li, Bowen Wang, Xiaomin He, Zheyuan Gu

### Project Leader
Qing Yan

### Advisor
Colin Zhang, Lei Zhang
assets/chart.png ADDED

Git LFS Details

  • SHA256: 0576d51813061c25f36c0fcbca837fed1a1d8e06042f2b352be4bdc7b7b5cab1
  • Pointer size: 130 Bytes
  • Size of remote file: 64.5 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: ad0b70b18bbf2fb7ad1a838437c1c6069eeb3fdf2df42f7299ec9abeb3427ae4
  • Pointer size: 130 Bytes
  • Size of remote file: 67.2 kB
assets/showcase/Tibetan.png ADDED

Git LFS Details

  • SHA256: 97bdb98172dc2d5c6a4668188588eb15cc33ecd042f9d9b8224ea933229741ce
  • Pointer size: 132 Bytes
  • Size of remote file: 2.89 MB
assets/showcase/formula1.png ADDED

Git LFS Details

  • SHA256: 5f7196032f7c4cc6aad9112ba4edeca6e1c3b303c34828711e107f0bb6603c44
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
assets/showcase/formula2.png ADDED

Git LFS Details

  • SHA256: a6edff564ee572a17062a2356eb6d83b98fc15e8bf1544b554f62003ce3ec98b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.74 MB
assets/showcase/formula3.png ADDED

Git LFS Details

  • SHA256: 45b6331b43e3b11d0af4674f021f04c9b9e4e096cf533c8f5f8a15d46261982f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
assets/showcase/grounding.png ADDED

Git LFS Details

  • SHA256: a11a2b2feba8208820ec35c8036c1ee5c0588ce9c9010a4e9ce7901c7cb65e8a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/showcase/kannada.png ADDED

Git LFS Details

  • SHA256: 96f0d36e3e0b08029903066a931defe9ddf002e515d7c63262dcbeeb6b86b32a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.92 MB
assets/showcase/nl.png ADDED

Git LFS Details

  • SHA256: 53e3bd10e4a85b9dfdbb3fc3b192c47f9834101dc224d4d979c145a0a574c700
  • Pointer size: 132 Bytes
  • Size of remote file: 3.84 MB
assets/showcase/reading_order.png ADDED

Git LFS Details

  • SHA256: 916b8cd5833ec7bbbd896771537ab66aa96a9c7f70e52685d7df533b6b0cbd2a
  • Pointer size: 132 Bytes
  • Size of remote file: 2.9 MB
assets/showcase/russian.png ADDED

Git LFS Details

  • SHA256: 307f66b083df466e5a84b049e6d5cf8117050d6e1a612dc2b2fe7f2c0e996b9c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.06 MB
assets/showcase/table1.png ADDED

Git LFS Details

  • SHA256: b0f75ef4c9a995a8cd29585dc7e9714fa9cb0e98490ededc745094e6c9dfd375
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
assets/showcase/table2.png ADDED

Git LFS Details

  • SHA256: d6084dac8845096749ba98191552182b98bde72806577b693d02069a1cc91b5b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.77 MB
assets/showcase/table3.png ADDED

Git LFS Details

  • SHA256: c42c3b33230d4d00f83b41cb22a9f21511de138bbbc4ce04c62aa916eed53428
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
assets/showcase/tradition_zh.png ADDED

Git LFS Details

  • SHA256: dfe7892659fdb07733ba102eeb55f2532a604194596eabb81b28d847a8127e50
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
assets/showcase_origin/Tibetan.png ADDED

Git LFS Details

  • SHA256: a761e2eeb987ea3c08ade69c9ffe5781d7e9a06828a1abd474c63b7f27e6d278
  • Pointer size: 131 Bytes
  • Size of remote file: 966 kB
assets/showcase_origin/formula_1.jpg ADDED

Git LFS Details

  • SHA256: 5b01fa0b9f47e2b0de6b67e02dc869600c8d715b98a952e05868a86d958348ce
  • Pointer size: 131 Bytes
  • Size of remote file: 677 kB
assets/showcase_origin/formula_2.jpg ADDED

Git LFS Details

  • SHA256: 322ec389bcd88e6870ffb91ccf5ca6b667b02b5c129f44e3c6e93877e7f95800
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB
assets/showcase_origin/formula_3.jpg ADDED

Git LFS Details

  • SHA256: e47451f351abdd184f8bda270e8fba08cb1e739157584d064d9245e4fbf29247
  • Pointer size: 131 Bytes
  • Size of remote file: 269 kB
assets/showcase_origin/kannada.jpg ADDED

Git LFS Details

  • SHA256: dad7aefe09cb39d7db21cd9e1c86c6fd47a2775e55b2fbe087ebdc2f44f0ab9f
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
assets/showcase_origin/nl.png ADDED

Git LFS Details

  • SHA256: aabb798d409851fb0fee59f3152354827fc633c5f9103a6ae130e6849e4c6030
  • Pointer size: 132 Bytes
  • Size of remote file: 1.15 MB
assets/showcase_origin/reading_order.png ADDED

Git LFS Details

  • SHA256: ebf62f427254a527d917b2d7acb3e68f7a6881277ffa382192e584508a84ca91
  • Pointer size: 131 Bytes
  • Size of remote file: 689 kB
assets/showcase_origin/russian.png ADDED

Git LFS Details

  • SHA256: 46e1e851f18e67153291b0608563eb98095975e1a9b0e23aa7a2308e229fdf49
  • Pointer size: 132 Bytes
  • Size of remote file: 1.8 MB
assets/showcase_origin/table_1.jpg ADDED

Git LFS Details

  • SHA256: 90345584ccc2c4a883779e5d47693276e8cf3fe752700af4f03b3142ab46cfa2
  • Pointer size: 131 Bytes
  • Size of remote file: 773 kB
assets/showcase_origin/table_2.jpg ADDED

Git LFS Details

  • SHA256: 308a117b9293b92ca11f2ead9d2bca58df39c435e53d50e7a78785785041acf1
  • Pointer size: 131 Bytes
  • Size of remote file: 942 kB
assets/showcase_origin/table_3.jpg ADDED

Git LFS Details

  • SHA256: 4542239b141f27f85006b1ec533e671e6e338ed4e18430b5974aa7a2d1105fef
  • Pointer size: 132 Bytes
  • Size of remote file: 2.06 MB
assets/showcase_origin/tradition_zh.png ADDED

Git LFS Details

  • SHA256: 318d5e7b11b0569deb0021a057cd8068d5e1b16ce50dfa2e8628998b1b5a448d
  • Pointer size: 131 Bytes
  • Size of remote file: 960 kB
assets/wechat.png ADDED

Git LFS Details

  • SHA256: c2208f35514007740f9b1efc1f738f0735095f5d6cd79b47eb7fac63bc7a0941
  • Pointer size: 131 Bytes
  • Size of remote file: 593 kB
demo/demo_colab_remote_server.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
demo/demo_gradio.py ADDED
@@ -0,0 +1,726 @@
1
+ """
2
+ Layout Inference Web Application with Gradio
3
+
4
+ A Gradio-based layout inference tool that supports image uploads and multiple backend inference engines.
5
+ It adopts a reference-style interface design while preserving the original inference logic.
6
+ """
7
+
8
+ import gradio as gr
9
+ import json
10
+ import os
11
+ import io
12
+ import tempfile
13
+ import base64
14
+ import zipfile
15
+ import uuid
16
+ import re
17
+ from pathlib import Path
18
+ from PIL import Image
19
+ import requests
20
+ import shutil # Import shutil for cleanup
21
+
22
+ # Local tool imports
23
+ from dots_ocr.utils import dict_promptmode_to_prompt
24
+ from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
25
+ from dots_ocr.utils.demo_utils.display import read_image
26
+ from dots_ocr.utils.doc_utils import load_images_from_pdf
27
+
28
+ # Add DotsOCRParser import
29
+ from dots_ocr.parser import DotsOCRParser
30
+
31
+
32
+ # ==================== Configuration ====================
33
+ DEFAULT_CONFIG = {
34
+ 'ip': "127.0.0.1",
35
+ 'port_vllm': 8000,
36
+ 'min_pixels': MIN_PIXELS,
37
+ 'max_pixels': MAX_PIXELS,
38
+ 'test_images_dir': "./assets/showcase_origin",
39
+ }
40
+
41
+ # ==================== Global Variables ====================
42
+ # Store current configuration
43
+ current_config = DEFAULT_CONFIG.copy()
44
+
45
+ # Create DotsOCRParser instance
46
+ dots_parser = DotsOCRParser(
47
+ ip=DEFAULT_CONFIG['ip'],
48
+ port=DEFAULT_CONFIG['port_vllm'],
49
+ dpi=200,
50
+ min_pixels=DEFAULT_CONFIG['min_pixels'],
51
+ max_pixels=DEFAULT_CONFIG['max_pixels']
52
+ )
53
+
54
+ def get_initial_session_state():
55
+ return {
56
+ 'processing_results': {
57
+ 'original_image': None,
58
+ 'processed_image': None,
59
+ 'layout_result': None,
60
+ 'markdown_content': None,
61
+ 'cells_data': None,
62
+ 'temp_dir': None,
63
+ 'session_id': None,
64
+ 'result_paths': None,
65
+ 'pdf_results': None
66
+ },
67
+ 'pdf_cache': {
68
+ "images": [],
69
+ "current_page": 0,
70
+ "total_pages": 0,
71
+ "file_type": None,
72
+ "is_parsed": False,
73
+ "results": []
74
+ }
75
+ }
76
+
77
+ def read_image_v2(img):
78
+ """Reads an image, supports URLs and local paths"""
79
+ if isinstance(img, str) and img.startswith(("http://", "https://")):
80
+ with requests.get(img, stream=True) as response:
81
+ response.raise_for_status()
82
+ img = Image.open(io.BytesIO(response.content))
83
+ elif isinstance(img, str):
84
+ img, _, _ = read_image(img, use_native=True)
85
+ elif isinstance(img, Image.Image):
86
+ pass
87
+ else:
88
+ raise ValueError(f"Invalid image type: {type(img)}")
89
+ return img
90
+
91
+ def load_file_for_preview(file_path, session_state):
92
+ """Loads a file for preview, supports PDF and image files"""
93
+ pdf_cache = session_state['pdf_cache']
94
+
95
+ if not file_path or not os.path.exists(file_path):
96
+ return None, "<div id='page_info_box'>0 / 0</div>", session_state
97
+
98
+ file_ext = os.path.splitext(file_path)[1].lower()
99
+
100
+ try:
101
+ if file_ext == '.pdf':
102
+ pages = load_images_from_pdf(file_path)
103
+ pdf_cache["file_type"] = "pdf"
104
+ elif file_ext in ['.jpg', '.jpeg', '.png']:
105
+ image = Image.open(file_path)
106
+ pages = [image]
107
+ pdf_cache["file_type"] = "image"
108
+ else:
109
+ return None, "<div id='page_info_box'>Unsupported file format</div>", session_state
110
+ except Exception as e:
111
+ return None, f"<div id='page_info_box'>PDF loading failed: {str(e)}</div>", session_state
112
+
113
+ pdf_cache["images"] = pages
114
+ pdf_cache["current_page"] = 0
115
+ pdf_cache["total_pages"] = len(pages)
116
+ pdf_cache["is_parsed"] = False
117
+ pdf_cache["results"] = []
118
+
119
+ return pages[0], f"<div id='page_info_box'>1 / {len(pages)}</div>", session_state
120
+
121
+ def turn_page(direction, session_state):
122
+ """Page turning function"""
123
+ pdf_cache = session_state['pdf_cache']
124
+
125
+ if not pdf_cache["images"]:
126
+ return None, "<div id='page_info_box'>0 / 0</div>", "", session_state
127
+
128
+ if direction == "prev":
129
+ pdf_cache["current_page"] = max(0, pdf_cache["current_page"] - 1)
130
+ elif direction == "next":
131
+ pdf_cache["current_page"] = min(pdf_cache["total_pages"] - 1, pdf_cache["current_page"] + 1)
132
+
133
+ index = pdf_cache["current_page"]
134
+ current_image = pdf_cache["images"][index] # Use the original image by default
135
+ page_info = f"<div id='page_info_box'>{index + 1} / {pdf_cache['total_pages']}</div>"
136
+
137
+ current_json = ""
138
+ if pdf_cache["is_parsed"] and index < len(pdf_cache["results"]):
139
+ result = pdf_cache["results"][index]
140
+ if 'cells_data' in result and result['cells_data']:
141
+ try:
142
+ current_json = json.dumps(result['cells_data'], ensure_ascii=False, indent=2)
143
+ except:
144
+ current_json = str(result.get('cells_data', ''))
145
+ if 'layout_image' in result and result['layout_image']:
146
+ current_image = result['layout_image']
147
+
148
+ return current_image, page_info, current_json, session_state
149
+
150
+ def get_test_images():
151
+ """Gets the list of test images"""
152
+ test_images = []
153
+ test_dir = current_config['test_images_dir']
154
+ if os.path.exists(test_dir):
155
+ test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)
156
+ if name.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf'))]
157
+ return test_images
158
+
159
+ def create_temp_session_dir():
160
+ """Creates a unique temporary directory for each processing request"""
161
+ session_id = uuid.uuid4().hex[:8]
162
+ temp_dir = os.path.join(tempfile.gettempdir(), f"dots_ocr_demo_{session_id}")
163
+ os.makedirs(temp_dir, exist_ok=True)
164
+ return temp_dir, session_id
165
+
166
+ def parse_image_with_high_level_api(parser, image, prompt_mode, fitz_preprocess=False):
167
+ """
168
+ Processes using the high-level API parse_image from DotsOCRParser
169
+ """
170
+ # Create a temporary session directory
171
+ temp_dir, session_id = create_temp_session_dir()
172
+
173
+ try:
174
+ # Save the PIL Image as a temporary file
175
+ temp_image_path = os.path.join(temp_dir, f"input_{session_id}.png")
176
+ image.save(temp_image_path, "PNG")
177
+
178
+ # Use the high-level API parse_image
179
+ filename = f"demo_{session_id}"
180
+ results = parser.parse_image(
181
+ input_path=image,
182
+ filename=filename,
183
+ prompt_mode=prompt_mode,
184
+ save_dir=temp_dir,
185
+ fitz_preprocess=fitz_preprocess
186
+ )
187
+
188
+ # Parse the results
189
+ if not results:
190
+ raise ValueError("No results returned from parser")
191
+
192
+ result = results[0] # parse_image returns a list with a single result
193
+
194
+ layout_image = None
195
+ if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
196
+ layout_image = Image.open(result['layout_image_path'])
197
+
198
+ cells_data = None
199
+ if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
200
+ with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
201
+ cells_data = json.load(f)
202
+
203
+ md_content = None
204
+ if 'md_content_path' in result and os.path.exists(result['md_content_path']):
205
+ with open(result['md_content_path'], 'r', encoding='utf-8') as f:
206
+ md_content = f.read()
207
+
208
+ return {
209
+ 'layout_image': layout_image,
210
+ 'cells_data': cells_data,
211
+ 'md_content': md_content,
212
+ 'filtered': result.get('filtered', False),
213
+ 'temp_dir': temp_dir,
214
+ 'session_id': session_id,
215
+ 'result_paths': result,
216
+ 'input_width': result.get('input_width', 0),
217
+ 'input_height': result.get('input_height', 0),
218
+ }
219
+ except Exception as e:
220
+ if os.path.exists(temp_dir):
221
+ shutil.rmtree(temp_dir, ignore_errors=True)
222
+ raise e
223
+
224
+ def parse_pdf_with_high_level_api(parser, pdf_path, prompt_mode):
225
+ """
226
+ Processes using the high-level API parse_pdf from DotsOCRParser
227
+ """
228
+ # Create a temporary session directory
229
+ temp_dir, session_id = create_temp_session_dir()
230
+
231
+ try:
232
+ # Use the high-level API parse_pdf
233
+ filename = f"demo_{session_id}"
234
+ results = parser.parse_pdf(
235
+ input_path=pdf_path,
236
+ filename=filename,
237
+ prompt_mode=prompt_mode,
238
+ save_dir=temp_dir
239
+ )
240
+
241
+ # Parse the results
242
+ if not results:
243
+ raise ValueError("No results returned from parser")
244
+
245
+ # Handle multi-page results
246
+ parsed_results = []
247
+ all_md_content = []
248
+ all_cells_data = []
249
+
250
+ for i, result in enumerate(results):
251
+ page_result = {
252
+ 'page_no': result.get('page_no', i),
253
+ 'layout_image': None,
254
+ 'cells_data': None,
255
+ 'md_content': None,
256
+ 'filtered': False
257
+ }
258
+
259
+ # Read the layout image
260
+ if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
261
+ page_result['layout_image'] = Image.open(result['layout_image_path'])
262
+
263
+ # Read the JSON data
264
+ if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
265
+ with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
266
+ page_result['cells_data'] = json.load(f)
267
+ all_cells_data.extend(page_result['cells_data'])
268
+
269
+ # Read the Markdown content
270
+ if 'md_content_path' in result and os.path.exists(result['md_content_path']):
271
+ with open(result['md_content_path'], 'r', encoding='utf-8') as f:
272
+ page_content = f.read()
273
+ page_result['md_content'] = page_content
274
+ all_md_content.append(page_content)
275
+ page_result['filtered'] = result.get('filtered', False)
276
+ parsed_results.append(page_result)
277
+
278
+ combined_md = "\n\n---\n\n".join(all_md_content) if all_md_content else ""
279
+ return {
280
+ 'parsed_results': parsed_results,
281
+ 'combined_md_content': combined_md,
282
+ 'combined_cells_data': all_cells_data,
283
+ 'temp_dir': temp_dir,
284
+ 'session_id': session_id,
285
+ 'total_pages': len(results)
286
+ }
287
+
288
+ except Exception as e:
289
+ if os.path.exists(temp_dir):
290
+ shutil.rmtree(temp_dir, ignore_errors=True)
291
+ raise e
292
+
293
+ # ==================== Core Processing Function ====================
294
+ def process_image_inference(session_state, test_image_input, file_input,
295
+ prompt_mode, server_ip, server_port, min_pixels, max_pixels,
296
+ fitz_preprocess=False
297
+ ):
298
+ """Core function to handle image/PDF inference"""
299
+ # Use session_state instead of global variables
300
+ processing_results = session_state['processing_results']
301
+ pdf_cache = session_state['pdf_cache']
302
+
303
+ if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
304
+ try:
305
+ shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
306
+ except Exception as e:
307
+ print(f"Failed to clean up previous temporary directory: {e}")
308
+
309
+ # Reset processing results for the current session
310
+ session_state['processing_results'] = get_initial_session_state()['processing_results']
311
+ processing_results = session_state['processing_results']
312
+
313
+ current_config.update({
314
+ 'ip': server_ip,
315
+ 'port_vllm': server_port,
316
+ 'min_pixels': min_pixels,
317
+ 'max_pixels': max_pixels
318
+ })
319
+
320
+ # Update parser configuration
321
+ dots_parser.ip = server_ip
322
+ dots_parser.port = server_port
323
+ dots_parser.min_pixels = min_pixels
324
+ dots_parser.max_pixels = max_pixels
325
+
326
+ input_file_path = file_input if file_input else test_image_input
327
+
328
+ if not input_file_path:
329
+ return None, "Please upload image/PDF file or select test image", "", "", gr.update(value=None), None, "", session_state
330
+
331
+ file_ext = os.path.splitext(input_file_path)[1].lower()
332
+
333
+ try:
334
+ if file_ext == '.pdf':
335
+ # MINIMAL CHANGE: The `process_pdf_file` function is now inlined and uses session_state.
336
+ preview_image, page_info, session_state = load_file_for_preview(input_file_path, session_state)
337
+ pdf_result = parse_pdf_with_high_level_api(dots_parser, input_file_path, prompt_mode)
338
+
339
+ session_state['pdf_cache']["is_parsed"] = True
340
+ session_state['pdf_cache']["results"] = pdf_result['parsed_results']
341
+
342
+ processing_results.update({
343
+ 'markdown_content': pdf_result['combined_md_content'],
344
+ 'cells_data': pdf_result['combined_cells_data'],
345
+ 'temp_dir': pdf_result['temp_dir'],
346
+ 'session_id': pdf_result['session_id'],
347
+ 'pdf_results': pdf_result['parsed_results']
348
+ })
349
+
350
+ total_elements = len(pdf_result['combined_cells_data'])
351
+ info_text = f"**PDF Information:**\n- Total Pages: {pdf_result['total_pages']}\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Total Detected Elements: {total_elements}\n- Session ID: {pdf_result['session_id']}"
352
+
353
+ current_page_layout_image = preview_image
354
+ current_page_json = ""
355
+ if session_state['pdf_cache']["results"]:
356
+ first_result = session_state['pdf_cache']["results"][0]
357
+ if 'layout_image' in first_result and first_result['layout_image']:
358
+ current_page_layout_image = first_result['layout_image']
359
+ if first_result.get('cells_data'):
360
+ try:
361
+ current_page_json = json.dumps(first_result['cells_data'], ensure_ascii=False, indent=2)
362
+ except:
363
+ current_page_json = str(first_result['cells_data'])
364
+
365
+ download_zip_path = None
366
+ if pdf_result['temp_dir']:
367
+ download_zip_path = os.path.join(pdf_result['temp_dir'], f"layout_results_{pdf_result['session_id']}.zip")
368
+ with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
369
+ for root, _, files in os.walk(pdf_result['temp_dir']):
370
+ for file in files:
371
+ if not file.endswith('.zip'): zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), pdf_result['temp_dir']))
372
+
373
+ return (
374
+ current_page_layout_image, info_text, pdf_result['combined_md_content'] or "No markdown content generated",
375
+ pdf_result['combined_md_content'] or "No markdown content generated",
376
+ gr.update(value=download_zip_path, visible=bool(download_zip_path)), page_info, current_page_json, session_state
377
+ )
378
+
379
+ else: # Image processing
380
+ image = read_image_v2(input_file_path)
381
+ session_state['pdf_cache'] = get_initial_session_state()['pdf_cache']
382
+
383
+ original_image = image
384
+ parse_result = parse_image_with_high_level_api(dots_parser, image, prompt_mode, fitz_preprocess)
385
+
386
+ if parse_result['filtered']:
387
+ info_text = f"**Image Information:**\n- Original Size: {original_image.width} x {original_image.height}\n- Processing: JSON parsing failed, using cleaned text output\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Session ID: {parse_result['session_id']}"
388
+ processing_results.update({
389
+ 'original_image': original_image, 'markdown_content': parse_result['md_content'],
390
+ 'temp_dir': parse_result['temp_dir'], 'session_id': parse_result['session_id'],
391
+ 'result_paths': parse_result['result_paths']
392
+ })
393
+ return original_image, info_text, parse_result['md_content'], parse_result['md_content'], gr.update(visible=False), None, "", session_state
394
+
395
+ md_content_raw = parse_result['md_content'] or "No markdown content generated"
396
+ processing_results.update({
397
+ 'original_image': original_image, 'layout_result': parse_result['layout_image'],
398
+ 'markdown_content': parse_result['md_content'], 'cells_data': parse_result['cells_data'],
399
+ 'temp_dir': parse_result['temp_dir'], 'session_id': parse_result['session_id'],
400
+ 'result_paths': parse_result['result_paths']
401
+ })
402
+
403
+ num_elements = len(parse_result['cells_data']) if parse_result['cells_data'] else 0
404
+ info_text = f"**Image Information:**\n- Original Size: {original_image.width} x {original_image.height}\n- Model Input Size: {parse_result['input_width']} x {parse_result['input_height']}\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Detected {num_elements} layout elements\n- Session ID: {parse_result['session_id']}"
405
+
406
+ current_json = json.dumps(parse_result['cells_data'], ensure_ascii=False, indent=2) if parse_result['cells_data'] else ""
407
+
408
+ download_zip_path = None
409
+ if parse_result['temp_dir']:
410
+ download_zip_path = os.path.join(parse_result['temp_dir'], f"layout_results_{parse_result['session_id']}.zip")
411
+ with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
412
+ for root, _, files in os.walk(parse_result['temp_dir']):
413
+ for file in files:
414
+ if not file.endswith('.zip'): zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), parse_result['temp_dir']))
415
+
416
+ return (
417
+ parse_result['layout_image'], info_text, parse_result['md_content'] or "No markdown content generated",
418
+ md_content_raw, gr.update(value=download_zip_path, visible=bool(download_zip_path)),
419
+ None, current_json, session_state
420
+ )
421
+ except Exception as e:
422
+ import traceback
423
+ traceback.print_exc()
424
+ return None, f"Error during processing: {e}", "", "", gr.update(value=None), None, "", session_state
425
+
426
+ # MINIMAL CHANGE: Functions now take `session_state` as an argument.
427
+ def clear_all_data(session_state):
428
+ """Clears all data"""
429
+ processing_results = session_state['processing_results']
430
+
431
+ if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
432
+ try:
433
+ shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
434
+ except Exception as e:
435
+ print(f"Failed to clean up temporary directory: {e}")
436
+
437
+ # Reset the session state by returning a new initial state
438
+ new_session_state = get_initial_session_state()
439
+
440
+ return (
441
+ None, # Clear file input
442
+ "", # Clear test image selection
443
+ None, # Clear result image
444
+ "Waiting for processing results...", # Reset info display
445
+ "## Waiting for processing results...", # Reset Markdown display
446
+ "🕐 Waiting for parsing result...", # Clear raw Markdown text
447
+ gr.update(visible=False), # Hide download button
448
+ "<div id='page_info_box'>0 / 0</div>", # Reset page info
449
+ "🕐 Waiting for parsing result...", # Clear current page JSON
450
+ new_session_state
451
+ )
452
+
453
+ def update_prompt_display(prompt_mode):
454
+ """Updates the prompt display content"""
455
+ return dict_promptmode_to_prompt[prompt_mode]
456
+
457
+ # ==================== Gradio Interface ====================
458
+ def create_gradio_interface():
459
+ """Creates the Gradio interface"""
460
+
461
+ # CSS styles, matching the reference style
462
+ css = """
463
+
464
+ #parse_button {
465
+ background: #FF576D !important; /* !important 确保覆盖主题默认样式 */
466
+ border-color: #FF576D !important;
467
+ }
468
+ /* 鼠标悬停时的颜色 */
469
+ #parse_button:hover {
470
+ background: #F72C49 !important;
471
+ border-color: #F72C49 !important;
472
+ }
473
+
474
+ #page_info_html {
475
+ display: flex;
476
+ align-items: center;
477
+ justify-content: center;
478
+ height: 100%;
479
+ margin: 0 12px;
480
+ }
481
+
482
+ #page_info_box {
483
+ padding: 8px 20px;
484
+ font-size: 16px;
485
+ border: 1px solid #bbb;
486
+ border-radius: 8px;
487
+ background-color: #f8f8f8;
488
+ text-align: center;
489
+ min-width: 80px;
490
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
491
+ }
492
+
493
+ #markdown_output {
494
+ min-height: 800px;
495
+ overflow: auto;
496
+ }
497
+
498
+ footer {
499
+ visibility: hidden;
500
+ }
501
+
502
+ #info_box {
503
+ padding: 10px;
504
+ background-color: #f8f9fa;
505
+ border-radius: 8px;
506
+ border: 1px solid #dee2e6;
507
+ margin: 10px 0;
508
+ font-size: 14px;
509
+ }
510
+
511
+ #result_image {
512
+ border-radius: 8px;
513
+ }
514
+
515
+ #markdown_tabs {
516
+ height: 100%;
517
+ }
518
+ """
519
+
520
+ with gr.Blocks(theme="ocean", css=css, title='dots.ocr') as demo:
521
+ session_state = gr.State(value=get_initial_session_state())
522
+
523
+ # Title
524
+ gr.HTML("""
525
+ <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
526
+ <h1 style="margin: 0; font-size: 2em;">🔍 dots.ocr</h1>
527
+ </div>
528
+ <div style="text-align: center; margin-bottom: 10px;">
529
+ <em>Supports image/PDF layout analysis and structured output</em>
530
+ </div>
531
+ """)
532
+
533
+ with gr.Row():
534
+ # Left side: Input and Configuration
535
+ with gr.Column(scale=1, elem_id="left-panel"):
536
+ gr.Markdown("### 📥 Upload & Select")
537
+ file_input = gr.File(
538
+ label="Upload PDF/Image",
539
+ type="filepath",
540
+ file_types=[".pdf", ".jpg", ".jpeg", ".png"],
541
+ )
542
+
543
+ test_images = get_test_images()
544
+ test_image_input = gr.Dropdown(
545
+ label="Or Select an Example",
546
+ choices=[""] + test_images,
547
+ value="",
548
+ )
549
+
550
+ gr.Markdown("### ⚙️ Prompt & Actions")
551
+ prompt_mode = gr.Dropdown(
552
+ label="Select Prompt",
553
+ choices=["prompt_layout_all_en", "prompt_layout_only_en", "prompt_ocr"],
554
+ value="prompt_layout_all_en",
555
+ )
556
+
557
+ # Display current prompt content
558
+ prompt_display = gr.Textbox(
559
+ label="Current Prompt Content",
560
+ value=dict_promptmode_to_prompt[list(dict_promptmode_to_prompt.keys())[0]],
561
+ lines=4,
562
+ max_lines=8,
563
+ interactive=False,
564
+ show_copy_button=True
565
+ )
566
+
567
+ with gr.Row():
568
+ process_btn = gr.Button("🔍 Parse", variant="primary", scale=2, elem_id="parse_button")
569
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
570
+
571
+ with gr.Accordion("🛠️ Advanced Configuration", open=False):
572
+ fitz_preprocess = gr.Checkbox(
573
+ label="Enable fitz_preprocess for images",
574
+ value=True,
575
+ info="Processes image via a PDF-like pipeline (image->pdf->200dpi image). Recommended if your image DPI is low."
576
+ )
577
+ with gr.Row():
578
+ server_ip = gr.Textbox(label="Server IP", value=DEFAULT_CONFIG['ip'])
579
+ server_port = gr.Number(label="Port", value=DEFAULT_CONFIG['port_vllm'], precision=0)
580
+ with gr.Row():
581
+ min_pixels = gr.Number(label="Min Pixels", value=DEFAULT_CONFIG['min_pixels'], precision=0)
582
+ max_pixels = gr.Number(label="Max Pixels", value=DEFAULT_CONFIG['max_pixels'], precision=0)
583
+ # Right side: Result Display
584
+ with gr.Column(scale=6, variant="compact"):
585
+ with gr.Row():
586
+ # Result Image
587
+ with gr.Column(scale=3):
588
+ gr.Markdown("### 👁️ File Preview")
589
+ result_image = gr.Image(
590
+ label="Layout Preview",
591
+ visible=True,
592
+ height=800,
593
+ show_label=False
594
+ )
595
+
596
+ # Page navigation (shown during PDF preview)
597
+ with gr.Row():
598
+ prev_btn = gr.Button("⬅ Previous", size="sm")
599
+ page_info = gr.HTML(
600
+ value="<div id='page_info_box'>0 / 0</div>",
601
+ elem_id="page_info_html"
602
+ )
603
+ next_btn = gr.Button("Next ➡", size="sm")
604
+
605
+ # Info Display
606
+ info_display = gr.Markdown(
607
+ "Waiting for processing results...",
608
+ elem_id="info_box"
609
+ )
610
+
611
+ # Markdown Result
612
+ with gr.Column(scale=3):
613
+ gr.Markdown("### ✔️ Result Display")
614
+
615
+ with gr.Tabs(elem_id="markdown_tabs"):
616
+ with gr.TabItem("Markdown Render Preview"):
617
+ md_output = gr.Markdown(
618
+ "## Please click the parse button to parse or select for single-task recognition...",
619
+ max_height=600,
620
+ latex_delimiters=[
621
+ {"left": "$$", "right": "$$", "display": True},
622
+ {"left": "$", "right": "$", "display": False}
623
+ ],
624
+ show_copy_button=False,
625
+ elem_id="markdown_output"
626
+ )
627
+
628
+ with gr.TabItem("Markdown Raw Text"):
629
+ md_raw_output = gr.Textbox(
630
+ value="🕐 Waiting for parsing result...",
631
+ label="Markdown Raw Text",
632
+ max_lines=100,
633
+ lines=38,
634
+ show_copy_button=True,
635
+ elem_id="markdown_output",
636
+ show_label=False
637
+ )
638
+
639
+ with gr.TabItem("Current Page JSON"):
640
+ current_page_json = gr.Textbox(
641
+ value="🕐 Waiting for parsing result...",
642
+ label="Current Page JSON",
643
+ max_lines=100,
644
+ lines=38,
645
+ show_copy_button=True,
646
+ elem_id="markdown_output",
647
+ show_label=False
648
+ )
649
+
650
+ # Download Button
651
+ with gr.Row():
652
+ download_btn = gr.DownloadButton(
653
+ "⬇️ Download Results",
654
+ visible=False
655
+ )
656
+
657
+ # When the prompt mode changes, update the display content
658
+ prompt_mode.change(
659
+ fn=update_prompt_display,
660
+ inputs=prompt_mode,
661
+ outputs=prompt_display,
662
+ )
663
+
664
+ # Show preview on file upload
665
+ file_input.upload(
666
+ # fn=lambda file_data, state: load_file_for_preview(file_data, state),
667
+ fn=load_file_for_preview,
668
+ inputs=[file_input, session_state],
669
+ outputs=[result_image, page_info, session_state]
670
+ )
671
+
672
+ # Also handle test image selection
673
+ test_image_input.change(
674
+ # fn=lambda path, state: load_file_for_preview(path, state),
675
+ fn=load_file_for_preview,
676
+ inputs=[test_image_input, session_state],
677
+ outputs=[result_image, page_info, session_state]
678
+ )
679
+
680
+ prev_btn.click(
681
+ fn=lambda s: turn_page("prev", s),
682
+ inputs=[session_state],
683
+ outputs=[result_image, page_info, current_page_json, session_state]
684
+ )
685
+
686
+ next_btn.click(
687
+ fn=lambda s: turn_page("next", s),
688
+ inputs=[session_state],
689
+ outputs=[result_image, page_info, current_page_json, session_state]
690
+ )
691
+
692
+ process_btn.click(
693
+ fn=process_image_inference,
694
+ inputs=[
695
+ session_state, test_image_input, file_input,
696
+ prompt_mode, server_ip, server_port, min_pixels, max_pixels,
697
+ fitz_preprocess
698
+ ],
699
+ outputs=[
700
+ result_image, info_display, md_output, md_raw_output,
701
+ download_btn, page_info, current_page_json, session_state
702
+ ]
703
+ )
704
+
705
+ clear_btn.click(
706
+ fn=clear_all_data,
707
+ inputs=[session_state],
708
+ outputs=[
709
+ file_input, test_image_input,
710
+ result_image, info_display, md_output, md_raw_output,
711
+ download_btn, page_info, current_page_json, session_state
712
+ ]
713
+ )
714
+
715
+ return demo
716
+
+# ==================== Main Program ====================
+if __name__ == "__main__":
+    import sys
+    # The UI port comes from the first CLI argument; fall back to 7860 if omitted.
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 7860
+    demo = create_gradio_interface()
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        debug=True
+    )
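A typical launch, assuming a vLLM backend started via `demo/launch_model_vllm.sh` is already serving on `127.0.0.1:8000`, is `python demo/demo_gradio.py 7860`; the positional argument is the Gradio UI port (7860 here is just an example value).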
demo/demo_gradio_annotion.py ADDED
@@ -0,0 +1,666 @@
+"""
+Layout Inference Web Application with Gradio - Annotation Version
+
+A Gradio-based layout inference tool that supports image uploads and multiple backend inference engines.
+This version adds an image annotation feature, allowing users to draw bounding boxes on an image and send both the image and the boxes to the model.
+"""
+
+import gradio as gr
+import json
+import os
+import io
+import tempfile
+import base64
+import zipfile
+import uuid
+import re
+from pathlib import Path
+from PIL import Image
+import requests
+from gradio_image_annotation import image_annotator
+
+# Local utility imports
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
+from dots_ocr.utils.demo_utils.display import read_image
+from dots_ocr.utils.doc_utils import load_images_from_pdf
+
+# Add DotsOCRParser import
+from dots_ocr.parser import DotsOCRParser
+
+# ==================== Configuration ====================
+DEFAULT_CONFIG = {
+    'ip': "127.0.0.1",
+    'port_vllm': 8000,
+    'min_pixels': MIN_PIXELS,
+    'max_pixels': MAX_PIXELS,
+    'test_images_dir': "./assets/showcase_origin",
+}
+
+# ==================== Global Variables ====================
+# Store the current configuration
+current_config = DEFAULT_CONFIG.copy()
+
+# Create a DotsOCRParser instance
+dots_parser = DotsOCRParser(
+    ip=DEFAULT_CONFIG['ip'],
+    port=DEFAULT_CONFIG['port_vllm'],
+    dpi=200,
+    min_pixels=DEFAULT_CONFIG['min_pixels'],
+    max_pixels=DEFAULT_CONFIG['max_pixels']
+)
+
+# Store processing results
+processing_results = {
+    'original_image': None,
+    'processed_image': None,
+    'layout_result': None,
+    'markdown_content': None,
+    'cells_data': None,
+    'temp_dir': None,
+    'session_id': None,
+    'result_paths': None,
+    'annotation_data': None  # Store annotation data
+}
+
+# ==================== Utility Functions ====================
+def read_image_v2(img):
+    """Reads an image, supporting URLs and local paths."""
+    if isinstance(img, str) and img.startswith(("http://", "https://")):
+        with requests.get(img, stream=True) as response:
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+    elif isinstance(img, str):
+        img, _, _ = read_image(img, use_native=True)
+    elif isinstance(img, Image.Image):
+        pass
+    else:
+        raise ValueError(f"Invalid image type: {type(img)}")
+    return img
+
+def get_test_images():
+    """Gets the list of test images."""
+    test_images = []
+    test_dir = current_config['test_images_dir']
+    if os.path.exists(test_dir):
+        test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)
+                       if name.lower().endswith(('.png', '.jpg', '.jpeg'))]
+    return test_images
+
+def create_temp_session_dir():
+    """Creates a unique temporary directory for each processing request."""
+    session_id = uuid.uuid4().hex[:8]
+    temp_dir = os.path.join(tempfile.gettempdir(), f"dots_ocr_demo_{session_id}")
+    os.makedirs(temp_dir, exist_ok=True)
+    return temp_dir, session_id
+
+def parse_image_with_bbox(parser, image, prompt_mode, bbox=None, fitz_preprocess=False):
+    """
+    Processes an image using DotsOCRParser, with support for the bbox parameter.
+    """
+    # Create a temporary session directory
+    temp_dir, session_id = create_temp_session_dir()
+
+    try:
+        # Save the PIL Image to a temporary file
+        temp_image_path = os.path.join(temp_dir, f"input_{session_id}.png")
+        image.save(temp_image_path, "PNG")
+
+        # Use the high-level parse_image interface, passing the bbox parameter
+        filename = f"demo_{session_id}"
+        results = parser.parse_image(
+            input_path=temp_image_path,
+            filename=filename,
+            prompt_mode=prompt_mode,
+            save_dir=temp_dir,
+            bbox=bbox,
+            fitz_preprocess=fitz_preprocess
+        )
+
+        # Parse the results
+        if not results:
+            raise ValueError("No results returned from parser")
+
+        result = results[0]  # parse_image returns a list with a single result
+
+        # Read the result files
+        layout_image = None
+        cells_data = None
+        md_content = None
+        filtered = False
+
+        # Read the layout image
+        if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
+            layout_image = Image.open(result['layout_image_path'])
+
+        # Read the JSON data
+        if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
+            with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
+                cells_data = json.load(f)
+
+        # Read the Markdown content
+        if 'md_content_path' in result and os.path.exists(result['md_content_path']):
+            with open(result['md_content_path'], 'r', encoding='utf-8') as f:
+                md_content = f.read()
+
+        # Check for the original response file (if JSON parsing fails)
+        if 'filtered' in result:
+            filtered = result['filtered']
+
+        return {
+            'layout_image': layout_image,
+            'cells_data': cells_data,
+            'md_content': md_content,
+            'filtered': filtered,
+            'temp_dir': temp_dir,
+            'session_id': session_id,
+            'result_paths': result
+        }
+
+    except Exception as e:
+        # Clean up the temporary directory on error
+        import shutil
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        raise e
+
+def process_annotation_data(annotation_data):
+    """Processes annotation data, converting it to the format required by the model."""
+    if not annotation_data or not annotation_data.get('boxes'):
+        return None, None
+
+    # Get image and box data
+    image = annotation_data.get('image')
+    boxes = annotation_data.get('boxes', [])
+
+    if not boxes:
+        return image, None
+
+    # Ensure the image is in PIL Image format
+    if image is not None:
+        import numpy as np
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        elif not isinstance(image, Image.Image):
+            # If it's another format, try to convert it
+            try:
+                image = Image.open(image) if isinstance(image, str) else Image.fromarray(image)
+            except Exception as e:
+                print(f"Image format conversion failed: {e}")
+                return None, None
+
+    # Get the coordinate information of the box (only one box)
+    box = boxes[0]
+    bbox = [box['xmin'], box['ymin'], box['xmax'], box['ymax']]
+
+    return image, bbox
+
+# ==================== Core Processing Function ====================
+def process_image_inference_with_annotation(annotation_data, test_image_input,
+                                            prompt_mode, server_ip, server_port, min_pixels, max_pixels,
+                                            fitz_preprocess=False):
+    """Core function for image inference, supporting annotation data."""
+    global current_config, processing_results, dots_parser
+
+    # First, clean up previous processing results
+    if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
+        import shutil
+        try:
+            shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
+        except Exception as e:
+            print(f"Failed to clean up previous temporary directory: {e}")
+
+    # Reset processing results
+    processing_results = {
+        'original_image': None,
+        'processed_image': None,
+        'layout_result': None,
+        'markdown_content': None,
+        'cells_data': None,
+        'temp_dir': None,
+        'session_id': None,
+        'result_paths': None,
+        'annotation_data': annotation_data
+    }
+
+    # Update configuration
+    current_config.update({
+        'ip': server_ip,
+        'port_vllm': server_port,
+        'min_pixels': min_pixels,
+        'max_pixels': max_pixels
+    })
+
+    # Update parser configuration
+    dots_parser.ip = server_ip
+    dots_parser.port = server_port
+    dots_parser.min_pixels = min_pixels
+    dots_parser.max_pixels = max_pixels
+
+    # Determine the input source and process annotation data
+    image = None
+    bbox = None
+
+    # Prioritize processing annotation data
+    if annotation_data and annotation_data.get('image') is not None:
+        image, bbox = process_annotation_data(annotation_data)
+        if image is not None:
+            # If there's a bbox, force the use of 'prompt_grounding_ocr' mode
+            assert bbox is not None
+            prompt_mode = "prompt_grounding_ocr"
+
+    # If there's no annotation data, check the test image input
+    if image is None and test_image_input and test_image_input != "":
+        try:
+            image = read_image_v2(test_image_input)
+        except Exception as e:
+            return None, f"Failed to read test image: {e}", "", "", gr.update(value=None), ""
+
+    if image is None:
+        return None, "Please select a test image or add an image in the annotation component", "", "", gr.update(value=None), ""
+    if bbox is None:
+        # Six return values to match the six output components bound below.
+        return "Please select a bounding box by mouse", "Please select a bounding box by mouse", "", "", gr.update(value=None), ""
+
+    try:
+        # Process using DotsOCRParser, passing the bbox parameter
+        original_image = image
+        parse_result = parse_image_with_bbox(dots_parser, image, prompt_mode, bbox, fitz_preprocess)
+
+        # Extract parsing results
+        layout_image = parse_result['layout_image']
+        cells_data = parse_result['cells_data']
+        md_content = parse_result['md_content']
+        filtered = parse_result['filtered']
+
+        # Store the results
+        processing_results.update({
+            'original_image': original_image,
+            'processed_image': None,
+            'layout_result': layout_image,
+            'markdown_content': md_content,
+            'cells_data': cells_data,
+            'temp_dir': parse_result['temp_dir'],
+            'session_id': parse_result['session_id'],
+            'result_paths': parse_result['result_paths'],
+            'annotation_data': annotation_data
+        })
+
+        # Handle the case where parsing fails
+        if filtered:
+            info_text = f"""
+**Image Information:**
+- Original Dimensions: {original_image.width} x {original_image.height}
+- Processing Mode: {'Region OCR' if bbox else 'Full Image OCR'}
+- Processing Status: JSON parsing failed, using cleaned text output
+- Server: {current_config['ip']}:{current_config['port_vllm']}
+- Session ID: {parse_result['session_id']}
+- Box Coordinates: {bbox if bbox else 'None'}
+"""
+
+            return (
+                md_content or "No markdown content generated",
+                info_text,
+                md_content or "No markdown content generated",
+                md_content or "No markdown content generated",
+                gr.update(visible=False),
+                ""
+            )
+
+        # Handle the case where JSON parsing succeeds
+        num_elements = len(cells_data) if cells_data else 0
+        info_text = f"""
+**Image Information:**
+- Original Dimensions: {original_image.width} x {original_image.height}
+- Processing Mode: {'Region OCR' if bbox else 'Full Image OCR'}
+- Server: {current_config['ip']}:{current_config['port_vllm']}
+- Detected {num_elements} layout elements
+- Session ID: {parse_result['session_id']}
+- Box Coordinates: {bbox if bbox else 'None'}
+"""
+
+        # Current page JSON output
+        current_json = ""
+        if cells_data:
+            try:
+                current_json = json.dumps(cells_data, ensure_ascii=False, indent=2)
+            except Exception:
+                current_json = str(cells_data)
+
+        # Create a downloadable ZIP file
+        download_zip_path = None
+        if parse_result['temp_dir']:
+            download_zip_path = os.path.join(parse_result['temp_dir'], f"layout_results_{parse_result['session_id']}.zip")
+            try:
+                with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                    for root, dirs, files in os.walk(parse_result['temp_dir']):
+                        for file in files:
+                            if file.endswith('.zip'):
+                                continue
+                            file_path = os.path.join(root, file)
+                            arcname = os.path.relpath(file_path, parse_result['temp_dir'])
+                            zipf.write(file_path, arcname)
+            except Exception as e:
+                print(f"Failed to create download ZIP: {e}")
+                download_zip_path = None
+
+        return (
+            md_content or "No markdown content generated",
+            info_text,
+            md_content or "No markdown content generated",
+            md_content or "No markdown content generated",
+            gr.update(value=download_zip_path, visible=True) if download_zip_path else gr.update(visible=False),
+            current_json
+        )
+
+    except Exception as e:
+        return f"An error occurred during processing: {e}", f"An error occurred during processing: {e}", "", "", gr.update(value=None), ""
+
+def load_image_to_annotator(test_image_input):
+    """Loads an image into the annotation component."""
+    image = None
+
+    # Check the test image input
+    if test_image_input and test_image_input != "":
+        try:
+            image = read_image_v2(test_image_input)
+        except Exception:
+            return None
+
+    if image is None:
+        return None
+
+    # Return the format required by the annotation component
+    return {
+        "image": image,
+        "boxes": []
+    }
+
+def clear_all_data():
+    """Clears all data."""
+    global processing_results
+
+    # Clean up the temporary directory
+    if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
+        import shutil
+        try:
+            shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
+        except Exception as e:
+            print(f"Failed to clean up temporary directory: {e}")
+
+    # Reset processing results
+    processing_results = {
+        'original_image': None,
+        'processed_image': None,
+        'layout_result': None,
+        'markdown_content': None,
+        'cells_data': None,
+        'temp_dir': None,
+        'session_id': None,
+        'result_paths': None,
+        'annotation_data': None
+    }
+
+    return (
+        "",  # Clear test image selection
+        None,  # Clear annotation component
+        "Waiting for processing results...",  # Reset info display
+        "## Waiting for processing results...",  # Reset Markdown display
+        "🕐 Waiting for parsing results...",  # Clear raw Markdown text
+        gr.update(visible=False),  # Hide download button
+        "🕐 Waiting for parsing results..."  # Clear JSON
+    )
+
+def update_prompt_display(prompt_mode):
+    """Updates the displayed prompt content."""
+    return dict_promptmode_to_prompt[prompt_mode]
+
+# ==================== Gradio Interface ====================
+def create_gradio_interface():
+    """Creates the Gradio interface."""
+
+    # CSS styling to match the reference style
+    css = """
+    footer {
+        visibility: hidden;
+    }
+
+    #info_box {
+        padding: 10px;
+        background-color: #f8f9fa;
+        border-radius: 8px;
+        border: 1px solid #dee2e6;
+        margin: 10px 0;
+        font-size: 14px;
+    }
+
+    #markdown_tabs {
+        height: 100%;
+    }
+
+    #annotation_component {
+        border-radius: 8px;
+    }
+    """
+
+    with gr.Blocks(theme="ocean", css=css, title='dots.ocr - Annotation') as demo:
+
+        # Title
+        gr.HTML("""
+        <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
+            <h1 style="margin: 0; font-size: 2em;">🔍 dots.ocr - Annotation Version</h1>
+        </div>
+        <div style="text-align: center; margin-bottom: 10px;">
+            <em>Supports image annotation, drawing boxes, and sending box information to the model for OCR.</em>
+        </div>
+        """)
+
+        with gr.Row():
+            # Left side: Input and Configuration
+            with gr.Column(scale=1, variant="compact"):
+                gr.Markdown("### 📁 Select Example")
+                test_images = get_test_images()
+                test_image_input = gr.Dropdown(
+                    label="Select Example",
+                    choices=[""] + test_images,
+                    value="",
+                    show_label=True
+                )
+
+                # Button to load image into the annotation component
+                load_btn = gr.Button("📷 Load Image to Annotation Area", variant="secondary")
+
+                prompt_mode = gr.Dropdown(
+                    label="Select Prompt",
+                    choices=["prompt_grounding_ocr"],
+                    value="prompt_grounding_ocr",
+                    show_label=True,
+                    info="If a box is drawn, 'prompt_grounding_ocr' mode will be used automatically."
+                )
+
+                # Display the current prompt content
+                prompt_display = gr.Textbox(
+                    label="Current Prompt Content",
+                    value=dict_promptmode_to_prompt["prompt_grounding_ocr"],
+                    lines=4,
+                    max_lines=8,
+                    interactive=False,
+                    show_copy_button=True
+                )
+
+                gr.Markdown("### ⚙️ Actions")
+                process_btn = gr.Button("🔍 Parse", variant="primary")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                gr.Markdown("### 🛠️ Configuration")
+
+                fitz_preprocess = gr.Checkbox(
+                    label="Enable fitz_preprocess",
+                    value=False,
+                    info="Performs fitz preprocessing on the image input, converting the image to a PDF and then to a 200dpi image."
+                )
+
+                with gr.Row():
+                    server_ip = gr.Textbox(
+                        label="Server IP",
+                        value=DEFAULT_CONFIG['ip']
+                    )
+                    server_port = gr.Number(
+                        label="Port",
+                        value=DEFAULT_CONFIG['port_vllm'],
+                        precision=0
+                    )
+
+                with gr.Row():
+                    min_pixels = gr.Number(
+                        label="Min Pixels",
+                        value=DEFAULT_CONFIG['min_pixels'],
+                        precision=0
+                    )
+                    max_pixels = gr.Number(
+                        label="Max Pixels",
+                        value=DEFAULT_CONFIG['max_pixels'],
+                        precision=0
+                    )
+
+            # Right side: Result Display
+            with gr.Column(scale=6, variant="compact"):
+                with gr.Row():
+                    # Image Annotation Area
+                    with gr.Column(scale=3):
+                        gr.Markdown("### 🎯 Image Annotation Area")
+                        gr.Markdown("""
+                        **Instructions:**
+                        - Method 1: Select an example image on the left and click "Load Image to Annotation Area".
+                        - Method 2: Upload an image directly in the annotation area below (drag and drop or click to upload).
+                        - Use the mouse to draw a box on the image to select the region for recognition.
+                        - Only one box can be drawn. To draw a new one, please delete the old one first.
+                        - **Hotkey: Press the Delete key to remove the selected box.**
+                        - After drawing a box, clicking Parse will automatically use the Region OCR mode.
+                        """)
+
+                        annotator = image_annotator(
+                            value=None,
+                            label="Image Annotation",
+                            height=600,
+                            show_label=False,
+                            elem_id="annotation_component",
+                            single_box=True,  # Only allow one box; a new box will replace the old one
+                            box_min_size=10,
+                            interactive=True,
+                            disable_edit_boxes=True,  # Disable the edit dialog
+                            label_list=["OCR Region"],  # Set the default label
+                            label_colors=[(255, 0, 0)],  # Set color to red
+                            use_default_label=True,  # Use the default label
+                            image_type="pil"  # Ensure it returns a PIL Image format
+                        )
+
+                        # Information Display
+                        info_display = gr.Markdown(
+                            "Waiting for processing results...",
+                            elem_id="info_box"
+                        )
+
+                    # Result Display Area
+                    with gr.Column(scale=3):
+                        gr.Markdown("### ✅ Results")
+
+                        with gr.Tabs(elem_id="markdown_tabs"):
+                            with gr.TabItem("Markdown Rendered View"):
+                                md_output = gr.Markdown(
+                                    "## Please upload an image and click the Parse button for recognition...",
+                                    label="Markdown Preview",
+                                    max_height=1000,
+                                    latex_delimiters=[
+                                        {"left": "$$", "right": "$$", "display": True},
+                                        {"left": "$", "right": "$", "display": False},
+                                    ],
+                                    show_copy_button=False,
+                                    elem_id="markdown_output"
+                                )
+
+                            with gr.TabItem("Markdown Raw Text"):
+                                md_raw_output = gr.Textbox(
+                                    value="🕐 Waiting for parsing results...",
+                                    label="Markdown Raw Text",
+                                    max_lines=100,
+                                    lines=38,
+                                    show_copy_button=True,
+                                    elem_id="markdown_output",
+                                    show_label=False
+                                )
+
+                            with gr.TabItem("JSON Result"):
+                                json_output = gr.Textbox(
+                                    value="🕐 Waiting for parsing results...",
+                                    label="JSON Result",
+                                    max_lines=100,
+                                    lines=38,
+                                    show_copy_button=True,
+                                    elem_id="markdown_output",
+                                    show_label=False
+                                )
+
+                # Download Button
+                with gr.Row():
+                    download_btn = gr.DownloadButton(
+                        "⬇️ Download Results",
+                        visible=False
+                    )
+
+        # Event Binding
+
+        # When the prompt mode changes, update the displayed content
+        prompt_mode.change(
+            fn=update_prompt_display,
+            inputs=prompt_mode,
+            outputs=prompt_display,
+            show_progress=False
+        )
+
+        # Load image into the annotation component
+        load_btn.click(
+            fn=load_image_to_annotator,
+            inputs=[test_image_input],
+            outputs=annotator,
+            show_progress=False
+        )
+
+        # Process Inference
+        process_btn.click(
+            fn=process_image_inference_with_annotation,
+            inputs=[
+                annotator, test_image_input,
+                prompt_mode, server_ip, server_port, min_pixels, max_pixels,
+                fitz_preprocess
+            ],
+            outputs=[
+                md_output, info_display, md_raw_output, md_raw_output,
+                download_btn, json_output
+            ],
+            show_progress=True
+        )
+
+        # Clear Data
+        clear_btn.click(
+            fn=clear_all_data,
+            outputs=[
+                test_image_input, annotator,
+                info_display, md_output, md_raw_output,
+                download_btn, json_output
+            ],
+            show_progress=False
+        )
+
+    return demo
+
+# ==================== Main Program ====================
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7861,  # Use a different port to avoid conflicts
+        debug=True
+    )
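For reference, the annotator's box dict is reduced to a plain `[xmin, ymin, xmax, ymax]` list before it reaches the parser. A minimal sketch of that flow, assuming the repo root is on `PYTHONPATH`, a vLLM server is running at the default address, and the coordinates and image path below are illustrative only:

```python
from PIL import Image

# Helpers defined in demo/demo_gradio_annotion.py above
from demo.demo_gradio_annotion import dots_parser, parse_image_with_bbox, process_annotation_data

annotation_data = {
    "image": Image.open("assets/showcase_origin/table_1.jpg"),  # illustrative input
    "boxes": [{"xmin": 100, "ymin": 120, "xmax": 640, "ymax": 480, "label": "OCR Region"}],
}
image, bbox = process_annotation_data(annotation_data)  # bbox -> [100, 120, 640, 480]
result = parse_image_with_bbox(dots_parser, image, "prompt_grounding_ocr", bbox=bbox)
print(result["md_content"])
```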
demo/demo_hf.py ADDED
@@ -0,0 +1,71 @@
+import os
+if "LOCAL_RANK" not in os.environ:
+    os.environ["LOCAL_RANK"] = "0"
+
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+from dots_ocr.utils import dict_promptmode_to_prompt
+
+def inference(image_path, prompt, model, processor):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image_path
+                },
+                {"type": "text", "text": prompt}
+            ]
+        }
+    ]
+
+    # Preparation for inference
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = inputs.to("cuda")
+
+    # Inference: generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=24000)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    print(output_text)
+
+
+if __name__ == "__main__":
+    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+    model_path = "./weights/DotsOCR"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        attn_implementation="flash_attention_2",
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+    image_path = "demo/demo_image1.jpg"
+    for prompt_mode, prompt in dict_promptmode_to_prompt.items():
+        print(f"prompt: {prompt}")
+        inference(image_path, prompt, model, processor)
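If only a single task is needed rather than looping over every mode, the same helpers support one call. A sketch reusing the `model`/`processor` built in the `__main__` block above (`prompt_ocr` is one of the keys exposed by `dict_promptmode_to_prompt`):

```python
prompt = dict_promptmode_to_prompt["prompt_ocr"]  # plain OCR; inspect the dict for all modes
inference("demo/demo_image1.jpg", prompt, model, processor)
```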
demo/demo_image1.jpg ADDED

Git LFS Details
  • SHA256: 90345584ccc2c4a883779e5d47693276e8cf3fe752700af4f03b3142ab46cfa2
  • Pointer size: 131 Bytes
  • Size of remote file: 773 kB
demo/demo_pdf1.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:570c44a595f52e963d0522fb561b338c327550b37974448f4e4f43c605b72f42
+size 461448
demo/demo_streamlit.py ADDED
@@ -0,0 +1,222 @@
+"""
+Layout Inference Web Application
+
+A Streamlit-based layout inference tool that supports image uploads and multiple backend inference engines.
+"""
+
+import streamlit as st
+import json
+import os
+import io
+import tempfile
+from PIL import Image
+import requests
+
+# Local utility imports
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.utils.format_transformer import layoutjson2md
+from dots_ocr.utils.layout_utils import draw_layout_on_image, post_process_cells
+from dots_ocr.utils.image_utils import get_input_dimensions, get_image_by_fitz_doc
+from dots_ocr.model.inference import inference_with_vllm
+from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
+from dots_ocr.utils.demo_utils.display import read_image
+
+
+# ==================== Configuration ====================
+DEFAULT_CONFIG = {
+    'ip': "127.0.0.1",
+    'port_vllm': 8000,
+    'min_pixels': MIN_PIXELS,
+    'max_pixels': MAX_PIXELS,
+    'test_images_dir': "./assets/showcase_origin",
+}
+
+# ==================== Utility Functions ====================
+@st.cache_resource
+def read_image_v2(img: str):
+    if img.startswith(("http://", "https://")):
+        with requests.get(img, stream=True) as response:
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+
+    if isinstance(img, str):
+        img, _, _ = read_image(img, use_native=True)
+    elif isinstance(img, Image.Image):
+        pass
+    else:
+        raise ValueError(f"Invalid image type: {type(img)}")
+    return img
+
+
+# ==================== UI Components ====================
+def create_config_sidebar():
+    """Create configuration sidebar"""
+    st.sidebar.header("Configuration Parameters")
+
+    config = {}
+    config['prompt_key'] = st.sidebar.selectbox("Prompt Mode", list(dict_promptmode_to_prompt.keys()))
+    config['ip'] = st.sidebar.text_input("Server IP", DEFAULT_CONFIG['ip'])
+    config['port'] = st.sidebar.number_input("Port", min_value=1000, max_value=9999, value=DEFAULT_CONFIG['port_vllm'])
+
+    # Image configuration
+    st.sidebar.subheader("Image Configuration")
+    config['min_pixels'] = st.sidebar.number_input("Min Pixels", value=DEFAULT_CONFIG['min_pixels'])
+    config['max_pixels'] = st.sidebar.number_input("Max Pixels", value=DEFAULT_CONFIG['max_pixels'])
+
+    return config
+
+def get_image_input():
+    """Get image input"""
+    st.markdown("#### Image Input")
+
+    input_mode = st.pills(label="Select input method", options=["Upload Image", "Enter Image URL/Path", "Select Test Image"], key="input_mode", label_visibility="collapsed")
+
+    if input_mode == "Upload Image":
+        # File uploader
+        uploaded_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
+        if uploaded_file is not None:
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
+                tmp_file.write(uploaded_file.getvalue())
+                return tmp_file.name
+    elif input_mode == 'Enter Image URL/Path':
+        # URL input
+        img_url_input = st.text_input("Enter Image URL/Path")
+        return img_url_input
+    elif input_mode == 'Select Test Image':
+        # Test image selection
+        test_images = []
+        test_dir = DEFAULT_CONFIG['test_images_dir']
+        if os.path.exists(test_dir):
+            test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)]
+        img_url_test = st.selectbox("Select Test Image", [""] + test_images)
+        return img_url_test
+
+    # st.pills returns None until the user picks an input method
+    return None
+
+
+def process_and_display_results(output: dict, image: Image.Image, config: dict):
+    """Process and display inference results"""
+    prompt, response = output['prompt'], output['response']
+
+    try:
+        col1, col2 = st.columns(2)
+        cells = json.loads(response)
+
+        # Post-processing
+        cells = post_process_cells(
+            image, cells,
+            image.width, image.height,
+            min_pixels=config['min_pixels'],
+            max_pixels=config['max_pixels']
+        )
+
+        # Calculate input dimensions
+        input_width, input_height = get_input_dimensions(
+            image,
+            min_pixels=config['min_pixels'],
+            max_pixels=config['max_pixels']
+        )
+        st.markdown('---')
+        st.write(f'Input Dimensions: {input_width} x {input_height}')
+        st.text_area('Original Model Output', response, height=200)
+        st.text_area('Post-processed Result', str(cells), height=200)
+
+        # Display results
+        with col1:
+            new_image = draw_layout_on_image(
+                image, cells,
+                resized_height=None, resized_width=None,
+                fill_bbox=True, draw_bbox=True
+            )
+            st.markdown('##### Visualization Result')
+            st.image(new_image, width=new_image.width)
+
+        with col2:
+            md_code = layoutjson2md(image, cells, text_key='text')
+            st.markdown('##### Markdown Format')
+            st.markdown(md_code, unsafe_allow_html=True)
+
+    except json.JSONDecodeError:
+        st.error("Model output is not a valid JSON format")
+    except Exception as e:
+        st.error(f"Error processing results: {e}")
+
+# ==================== Main Application ====================
+def main():
+    """Main application function"""
+    st.set_page_config(page_title="Layout Inference Tool", layout="wide")
+    st.title("🔍 Layout Inference Tool")
+
+    # Configuration
+    config = create_config_sidebar()
+    prompt = dict_promptmode_to_prompt[config['prompt_key']]
+    st.sidebar.info(f"Current Prompt: {prompt}")
+
+    # Image input
+    img_url = get_image_input()
+    start_button = st.button('🚀 Start Inference', type="primary")
+
+    if img_url is not None and img_url.strip() != "":
+        try:
+            origin_image = read_image_v2(img_url)
+            st.write(f"Original Dimensions: {origin_image.width} x {origin_image.height}")
+            processed_image = origin_image
+        except Exception as e:
+            st.error(f"Failed to read image: {e}")
+            return
+    else:
+        st.info("Please enter an image URL/path or upload an image")
+        return
+
+    output = None
+    # Inference button
+    if start_button:
+        with st.spinner(f"Inferring... Server: {config['ip']}:{config['port']}"):
+            response = inference_with_vllm(
+                processed_image, prompt, config['ip'], config['port'],
+            )
+            output = {
+                'prompt': prompt,
+                'response': response,
+            }
+    else:
+        st.image(processed_image, width=500)
+
+    # Process results
+    if output:
+        process_and_display_results(output, processed_image, config)
+
+if __name__ == "__main__":
+    main()
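Launch it with the standard Streamlit entry point, e.g. `streamlit run demo/demo_streamlit.py`, with the vLLM server reachable at the IP/port configured in the sidebar (defaults to `127.0.0.1:8000`).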
demo/demo_vllm.py ADDED
@@ -0,0 +1,42 @@
+import argparse
+
+from PIL import Image
+from transformers.utils.versions import require_version
+
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.model.inference import inference_with_vllm
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--ip", type=str, default="localhost")
+parser.add_argument("--port", type=str, default="8000")
+parser.add_argument("--model_name", type=str, default="model")
+parser.add_argument("--prompt_mode", type=str, default="prompt_layout_all_en")
+
+args = parser.parse_args()
+
+require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
+
+
+def main():
+    image_path = "demo/demo_image1.jpg"
+    prompt = dict_promptmode_to_prompt[args.prompt_mode]
+    image = Image.open(image_path)
+    response = inference_with_vllm(
+        image,
+        prompt,
+        ip=args.ip,
+        port=args.port,
+        temperature=0.1,
+        top_p=0.9,
+        model_name=args.model_name,
+    )
+    print(f"response: {response}")
+
+
+if __name__ == "__main__":
+    main()
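Run it against a live server with, e.g., `python demo/demo_vllm.py --ip localhost --port 8000 --prompt_mode prompt_layout_all_en`; note that `--model_name` must match the `--served-model-name` passed to vLLM (`model` by default in `demo/launch_model_vllm.sh`).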
demo/launch_model_vllm.sh ADDED
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# download the model to /path/to/model
+if [ -z "$NODOWNLOAD" ]; then
+    python3 tools/download_model.py
+fi
+
+# register the custom model code with vllm
+hf_model_path=./weights/DotsOCR  # Path to your downloaded model weights
+export PYTHONPATH=$(dirname "$hf_model_path"):$PYTHONPATH
+sed -i '/^from vllm\.entrypoints\.cli\.main import main$/a\
+from DotsOCR import modeling_dots_ocr_vllm' `which vllm`
+
+# launch the vllm server
+model_name=model
+CUDA_VISIBLE_DEVICES=0 vllm serve ${hf_model_path} --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --chat-template-content-format string --served-model-name ${model_name} --trust-remote-code
+
+# run the python demo once the vllm server is up:
+# python demo/demo_vllm.py
docker/Dockerfile ADDED
@@ -0,0 +1,4 @@
+FROM vllm/vllm-openai:v0.9.1
+
+RUN pip3 install flash_attn==2.8.0.post2
+RUN pip3 install transformers==4.51.3
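The compose file below expects this image to be tagged `dots-ocr:latest`; a plausible build command (path assumed from the repo layout) is `docker build -t dots-ocr:latest docker/`.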
docker/docker-compose.yml ADDED
@@ -0,0 +1,44 @@
+version: '3.8'
+
+services:
+  dots-ocr-server:
+    image: dots-ocr:latest
+    container_name: dots-ocr-container
+    ports:
+      - "8000:8000"
+    volumes:
+      # Download the model locally first; model URL: https://www.modelscope.cn/models/rednote-hilab/dots.ocr
+      - ./model/dots.ocr:/workspace/weights/DotsOCR
+    environment:
+      - PYTHONPATH=/workspace/weights:$PYTHONPATH
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              device_ids: ['0']
+    entrypoint: /bin/bash
+    command:
+      - -c
+      - |
+        set -ex;
+        echo '--- Starting setup and server ---';
+        echo 'Modifying vllm entrypoint...';
+        # This sed command patches the vllm entrypoint script to import the custom modeling code.
+        sed -i '/^from vllm\.entrypoints\.cli\.main import main/a from DotsOCR import modeling_dots_ocr_vllm' $(which vllm) && \
+        echo 'vllm script after patch:';
+        # Show the patched part of the vllm script for verification.
+        grep -A 1 'from vllm.entrypoints.cli.main import main' $(which vllm) && \
+        echo 'Starting server...';
+        # Use 'exec' to replace the current shell process with the vllm server,
+        # ensuring logs are properly forwarded to Docker's standard output.
+        exec vllm serve /workspace/weights/DotsOCR \
+          --tensor-parallel-size 1 \
+          --gpu-memory-utilization 0.8 \
+          --chat-template-content-format string \
+          --served-model-name dotsocr-model \
+          --trust-remote-code
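With the image built and the weights downloaded into `./model/dots.ocr`, `docker compose up -d` from the `docker/` directory should bring the server up on port 8000. Note the served model name here is `dotsocr-model`, not the `model` default used by the Python demos, so pass the matching `model_name` when calling this server.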
dots.ocr LICENSE AGREEMENT ADDED
@@ -0,0 +1,109 @@
+dots.ocr LICENSE AGREEMENT
+
+Effective Date: [August 8, 2025]
+
+Copyright Holder: [Xingyin Information Technology (Shanghai) Co., Ltd]
+
+This License Agreement (“Agreement”) governs Your use, reproduction, modification, and distribution of dots.ocr (the "Model Materials"). This Agreement is designed to maximize the openness and use of the Model Materials while addressing the unique legal, ethical, and technical challenges posed by large language models.
+
+WHEREAS, Licensor has developed the dots.ocr document parsing model and intends to distribute the Model Materials under an open‑source framework;
+WHEREAS, traditional open-source licenses (e.g., the MIT License) may not fully address the inherent complexities of document parsing models, namely their multiple components (code, weights, training data), potential ethical risks, data‑governance issues, and intellectual‑property and liability questions regarding AI‑generated content;
+WHEREAS, Licensor seeks to provide a legal framework that ensures maximum access to and use of the Model Materials while clearly defining the rights, obligations, and liabilities of Licensee;
+
+THEREFORE, the parties agree that, subject to the MIT License, they shall be bound by the following terms and conditions:
+
+1. Definitions and Interpretation
+Purpose: To define key terms used in this Agreement, particularly "Model Materials," ensuring clarity of the license scope beyond traditional software code. To clarify the order of precedence between this Agreement and the MIT License to avoid conflict.
+
+1.1 “Licensor” shall mean the entity providing the Model Materials under this Agreement, namely [Xingyin Information Technology (Shanghai) Co., Ltd].
+
+1.2 “Licensee” or "You" shall mean any individual or entity exercising permissions granted by this Agreement.
+
+1.3 “Model Materials” shall mean all materials provided by Licensor under this Agreement, including but not limited to:
+    (a) one or more machine‑learning models, including architecture and trained parameters (i.e., model weights);
+    (b) all associated preprocessing, training, inference, and fine‑tuning code;
+    (c) training datasets and evaluation scripts (or their detailed descriptions and access mechanisms); and
+    (d) any accompanying documentation, metadata, and tools.
+The above Model Materials shall be subject to the content published on the Licensor’s website or GitHub repository at https://github.com/rednote-hilab/dots.ocr.
+
+1.4 “Outputs” shall mean any content generated through the use of the Model Materials, such as text, tables, code, layout information, and formulas extracted from documents.
+
+1.5 “MIT License” shall mean The MIT Open Source License published by the Massachusetts Institute of Technology.
+
+1.6 Priority of Agreement. In the event of any conflict or inconsistency between this Agreement and the MIT License, the terms of the MIT License shall prevail. However, if the terms of the MIT License are ambiguous or silent on a particular matter, the provisions of this Agreement shall apply and supplement the MIT License.
+
+2. Grant of Rights and Scope of Use
+
+Purpose: To grant broad, permissive rights to the Licensee for the Model Materials—including code, weights, data, and documentation—to ensure maximum openness and flexibility while clarifying the free use of model-generated content. Additionally, it clarifies the feasibility of transitioning from open source to commercial use and the use of OpenAPI interfaces.
+
+2.1 Grant of Copyright License. Subject to Licensee's compliance with this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non‑exclusive, no-charge, royalty‑free copyright license to use (run or test), reproduce, modify, create derivative works of, merge, publish, distribute the Model Materials; sublicense and/or sell copies of the Model Materials or any derivative works thereof; and incorporate the unmodified or modified Model Materials into proprietary products or services, including for commercial purposes, software‑as‑a‑service (SaaS) offerings, or via OpenAPI or other interfaces.
+
+2.2 Fundamental Capabilities. The Model Materials only provide the fundamental model’s capabilities. Licensees may develop derivative AI applications or undertake task‑specific training thereon.
+
+2.3 From Open Source to Commercial Use. The open-source release does not preclude Licensor’s commercial exploitation of the Model Materials, in whole or in part. Any such commercial use shall, at that time, be subject to license agreements between Licensor and applicable users.
+
+2.4 API‑Service Exception. Licensees who access the Model Materials through API calls or provide model services via API interfaces (without directly distributing model weights) shall not be subject to this Agreement unless otherwise expressly agreed. Instead, such use shall be governed by the API terms of use published by Licensor (if any).
+
+3. Acceptable Use Policy and Prohibited Uses
+
+3.1 Responsible Use. Licensee must use the Model Materials in a responsible, ethical, and lawful manner, in compliance with all applicable laws, regulations, industry standards, and best practices.
+
+3.2 Enterprise On‑Premises Deployment. The Licensee may deploy the Model Materials in closed‑source, on‑premises enterprise environments.
+
+3.3 Prohibited Uses. Any breach of the prohibitions below will result in the automatic termination of all licenses granted under this Agreement. Licensee agrees not to use the Model Materials or any derivative works thereof, in connection with:
+(a) Identification and Utilization of Illegal/Harmful Content: Includes identifying graphic/text materials used for counterfeiting certificates/invoices, perpetrating fraud, or launching cyberattacks; or processing images containing illegal content such as violence, criminal activities, disinformation, or child exploitation.
+(b) Privacy Infringement and Discriminatory Practices: Extracting personal sensitive information (e.g., ID numbers, medical records, biometric data) or protected characteristics (e.g., race, gender) from images without legal authorization or consent, for purposes of privacy violation, automated discriminatory decision-making, or harassment.
+(c) Copyright Restrictions: Licensees shall not use the tool for unauthorized digitization of publications/document scanning or bulk scraping of content. Any use involving publications or other copyright-protected materials must first obtain relevant permissions.
+
+4. Intellectual Property Ownership and Contributions
+
+4.1 Licensor's Copyright Reservation. Licensor reserves all right, title, and interest in and to the Model Materials (including the model architecture, parameters, code, and original training data), except as expressly licensed herein. The original copyright of the Model Materials belongs to the Licensor.
+
+4.2 Patent License. Subject to the terms and conditions of this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model Materials, where such license applies only to those patent claims licensable by the Licensor that are necessarily infringed by its contribution(s).
+If Licensee institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model Materials constitute direct or contributory patent infringement, then any patent licenses granted under this License for the Model Materials shall terminate as of the date such litigation is asserted or filed.
+
+4.3 Outputs. The Outputs generated through the use of the Model Materials generally refer to text, tables, layouts, and other content extracted from documents or images. The extracted content itself does not generate new intellectual property rights, and all intellectual property remains with the original authors or copyright holders. The Licensee is responsible for due diligence regarding the legality of the Outputs, particularly where the content extracted by the OCR model may be substantially similar to existing copyrighted works, which could present intellectual property infringement risks. The Licensor assumes no liability for such infringements.
+4.4 Trademarks. Nothing in this License permits Licensee to make use of Licensor’s trademarks, trade names, logos (e.g., “rednote,” “Xiaohongshu,” “dots.ocr”) or to otherwise suggest endorsement or misrepresent the relationship between the parties, unless Licensor’s prior written approval is granted.
+
+5. Data Governance, Privacy, and Security
+
+5.1 Data Quality and Bias. Licensee shall use training data from lawful sources and is encouraged to conduct due diligence before deploying the Model Materials and to take reasonable steps to mitigate any known biases in its training data or applications.
+
+5.2 Privacy Protection.
+    (a) Sensitive‑Data Restrictions. It is prohibited to use the Model Materials to process, extract, or infer sensitive personal data protected under specific laws (such as GDPR or HIPAA), particularly when dealing with documents containing personally identifiable information (such as ID numbers, health data, financial information, etc.), unless Licensee has obtained all necessary consents, lawful basis, or authorizations, and has implemented adequate anonymization, pseudonymization, or other privacy-enhancing technologies.
+    (b) Data Minimization and Purpose Limitation. The Licensee shall follow the principle of data minimization when using the OCR Model, processing only the user data necessary for specific, explicit, and lawful purposes. Specifically, the OCR Model should avoid processing unnecessary sensitive data and ensure compliance with applicable privacy protection laws during data handling.
+    (c) Transparency. Licensee shall provide clear and transparent privacy policies and terms of use when processing user data, particularly during document scanning and information extraction.
+
+5.3 Security Measures. Licensee shall implement appropriate technical and administrative safeguards to protect the Model Materials and any associated data against unauthorized access, disclosure, alteration, or destruction. Such measures may include, but are not limited to, encryption, access controls, logging, and audit trails.
+
+5.4 Further Training. Licensee may only use user‑provided input or Outputs for training, fine-tuning, or improving other AI models if it has obtained the specific and informed consent of data subjects.
+
+6. Disclaimer of Warranty and Limitation of Liability
+
+6.1 “AS IS” Basis. Unless required by applicable law, the Model Materials are provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. Licensee is solely responsible for determining the appropriateness of using or redistributing the Model Materials and assumes any risks associated with the exercise of permissions under this License. Licensor does not provide any warranty of non-infringement but represents that no infringing code has been knowingly included.
+
+6.2 Outputs Disclaimer. As a neutral technology, Licensor disclaims all liability for the accuracy, completeness, reliability, safety, legality, or suitability of any Outputs. The Licensee is solely responsible for verifying the accuracy and appropriateness of AI-generated content and shall provide appropriate disclosures when publishing or relying upon such content.
+
+6.3 Limitation of Liability and Recourse. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall Licensor or contributors be liable for any claims or damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model Materials (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Licensor has been advised of the possibility of such damages. If such losses are incurred, recourse may be sought against the Licensee responsible for causing the loss.
+
+6.4 Content‑Filtering Disclaimer. Although the Model Materials may include content‑filtering mechanisms, Licensor makes no warranties of any kind regarding the stability, quality, accuracy, completeness, or any specific outcome of Outputs. Licensee is solely responsible for reviewing, verifying, and performing quality control on Outputs and assumes all associated risks and liabilities.
+
+7. Attribution and License Reservation
+
+7.1 License. When distributing or redistributing the Model Materials, Licensee must give any other recipients of the Model Materials a copy of this Agreement.
+
+7.2 Copyright and Notices. When distributing any part of the Model Materials, Licensee must retain all copyright, patent, trademark, and attribution notices included in the Model Materials.
+
+7.3 Attribution. Licensee is encouraged to prominently display the name of Licensor and the Model Materials in any public statements, products, or services that contain the Model Materials (or any derivative works thereof), to promote transparency and community trust. If Licensee distributes modified weights or fine‑tuned models based on the Model Materials, Licensee must prominently display the following statement in the related website or documentation: “Built with dots.ocr.”
+
+8. Governing Law and Dispute Resolution
+
+8.1 Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the People’s Republic of China, without regard to its conflict of laws principles.
+
+8.2 Dispute Resolution. Any dispute, claim, or disagreement arising out of or relating to this Agreement shall first be resolved through amicable consultation. If such consultation fails, the dispute shall be submitted to the Hangzhou Arbitration Commission for arbitration. The arbitration shall be conducted in accordance with the laws of China, and the place of arbitration shall be [Hangzhou, China]. The arbitral award shall be final and binding upon both parties.
+
+9. Regulatory Compliance Amendments
+In the event that any part of this Agreement becomes invalid or requires adjustment due to changes in applicable laws or regulations, Licensor reserves the right to issue a revised version of this Agreement. Licensee shall migrate to the new version within [e.g., ninety (90)] days of its release; otherwise, all rights granted under this Agreement shall automatically terminate.
+
+10. Security Reporting
+A Licensee who discovers any security vulnerability in the Model Materials may report it to Licensor via: dots-feedback@xiaohongshu.com. Licensee shall not disclose vulnerability details until Licensor issues an official remediation, unless otherwise required by law.
dots_ocr/__init__.py ADDED
@@ -0,0 +1 @@
+from .parser import DotsOCRParser
dots_ocr/model/inference.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import io
3
+ import base64
4
+ import math
5
+ from PIL import Image
6
+ import requests
7
+ from dots_ocr.utils.image_utils import PILimage_to_base64
8
+ from openai import OpenAI
9
+ import os
10
+
11
+
12
+ def inference_with_vllm(
13
+ image,
14
+ prompt,
15
+ ip="localhost",
16
+ port=8000,
17
+ temperature=0.1,
18
+ top_p=0.9,
19
+ max_completion_tokens=32768,
20
+ model_name='model',
21
+ ):
22
+
23
+ addr = f"http://{ip}:{port}/v1"
24
+ client = OpenAI(api_key="{}".format(os.environ.get("API_KEY", "0")), base_url=addr)
25
+ messages = []
26
+ messages.append(
27
+ {
28
+ "role": "user",
29
+ "content": [
30
+ {
31
+ "type": "image_url",
32
+ "image_url": {"url": PILimage_to_base64(image)},
33
+ },
34
+ {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"} # if no "<|img|><|imgpad|><|endofimg|>" here,vllm v1 will add "\n" here
35
+ ],
36
+ }
37
+ )
38
+ try:
39
+ response = client.chat.completions.create(
40
+ messages=messages,
41
+ model=model_name,
42
+ max_completion_tokens=max_completion_tokens,
43
+ temperature=temperature,
44
+ top_p=top_p)
45
+ response = response.choices[0].message.content
46
+ return response
47
+ except Exception as e:  # the openai>=1.0 client raises openai.* errors (httpx-based), not requests exceptions
48
+ print(f"request error: {e}")
49
+ return None
50
+
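A minimal usage sketch for `inference_with_vllm` (a sketch only, assuming a vLLM server is already serving the dots.ocr weights on localhost:8000 under the served name `model`, e.g. via demo/launch_model_vllm.sh; `demo/demo_image1.jpg` ships with this repo):

```python
# Sketch: assumes a running vLLM server at localhost:8000 serving the
# dots.ocr weights under the name "model" (see demo/launch_model_vllm.sh).
from PIL import Image

from dots_ocr.model.inference import inference_with_vllm
from dots_ocr.utils.prompts import dict_promptmode_to_prompt

image = Image.open("demo/demo_image1.jpg")
response = inference_with_vllm(
    image,
    dict_promptmode_to_prompt["prompt_layout_all_en"],
    ip="localhost",
    port=8000,
    model_name="model",
)
print(response)  # layout JSON as text, or None if the request failed
```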
dots_ocr/parser.py ADDED
@@ -0,0 +1,428 @@
1
+ import os
2
+ import json
3
+ from tqdm import tqdm
4
+ from multiprocessing.pool import ThreadPool, Pool
5
+ import argparse
6
+
7
+
8
+ from dots_ocr.model.inference import inference_with_vllm
9
+ from dots_ocr.utils.consts import image_extensions, MIN_PIXELS, MAX_PIXELS
10
+ from dots_ocr.utils.image_utils import get_image_by_fitz_doc, fetch_image, smart_resize
11
+ from dots_ocr.utils.doc_utils import fitz_doc_to_image, load_images_from_pdf
12
+ from dots_ocr.utils.prompts import dict_promptmode_to_prompt
13
+ from dots_ocr.utils.layout_utils import post_process_output, draw_layout_on_image, pre_process_bboxes
14
+ from dots_ocr.utils.format_transformer import layoutjson2md
15
+
16
+
17
+ class DotsOCRParser:
18
+ """
19
+ Parse an image or a PDF file with dots.ocr, via a vLLM server or a local HF model.
20
+ """
21
+
22
+ def __init__(self,
23
+ ip='localhost',
24
+ port=8000,
25
+ model_name='model',
26
+ temperature=0.1,
27
+ top_p=1.0,
28
+ max_completion_tokens=16384,
29
+ num_thread=64,
30
+ dpi=200,
31
+ output_dir="./output",
32
+ min_pixels=None,
33
+ max_pixels=None,
34
+ use_hf=False,
35
+ ):
36
+ self.dpi = dpi
37
+
38
+ # default args for vllm server
39
+ self.ip = ip
40
+ self.port = port
41
+ self.model_name = model_name
42
+ # default args for inference
43
+ self.temperature = temperature
44
+ self.top_p = top_p
45
+ self.max_completion_tokens = max_completion_tokens
46
+ self.num_thread = num_thread
47
+ self.output_dir = output_dir
48
+ self.min_pixels = min_pixels
49
+ self.max_pixels = max_pixels
50
+
51
+ self.use_hf = use_hf
52
+ if self.use_hf:
53
+ self._load_hf_model()
54
+ print(f"use hf model, num_thread will be set to 1")
55
+ else:
56
+ print(f"use vllm model, num_thread will be set to {self.num_thread}")
57
+ assert self.min_pixels is None or self.min_pixels >= MIN_PIXELS
58
+ assert self.max_pixels is None or self.max_pixels <= MAX_PIXELS
59
+
60
+ def _load_hf_model(self):
61
+ import torch
62
+ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
63
+ from qwen_vl_utils import process_vision_info
64
+
65
+ model_path = "./weights/DotsOCR"
66
+ self.model = AutoModelForCausalLM.from_pretrained(
67
+ model_path,
68
+ attn_implementation="flash_attention_2",
69
+ torch_dtype=torch.bfloat16,
70
+ device_map="auto",
71
+ trust_remote_code=True
72
+ )
73
+ self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, use_fast=True)
74
+ self.process_vision_info = process_vision_info
75
+
76
+ def _inference_with_hf(self, image, prompt):
77
+ messages = [
78
+ {
79
+ "role": "user",
80
+ "content": [
81
+ {
82
+ "type": "image",
83
+ "image": image
84
+ },
85
+ {"type": "text", "text": prompt}
86
+ ]
87
+ }
88
+ ]
89
+
90
+ # Preparation for inference
91
+ text = self.processor.apply_chat_template(
92
+ messages,
93
+ tokenize=False,
94
+ add_generation_prompt=True
95
+ )
96
+ image_inputs, video_inputs = self.process_vision_info(messages)
97
+ inputs = self.processor(
98
+ text=[text],
99
+ images=image_inputs,
100
+ videos=video_inputs,
101
+ padding=True,
102
+ return_tensors="pt",
103
+ )
104
+
105
+ inputs = inputs.to("cuda")
106
+
107
+ # Inference: Generation of the output
108
+ generated_ids = self.model.generate(**inputs, max_new_tokens=24000)
109
+ generated_ids_trimmed = [
110
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
111
+ ]
112
+ response = self.processor.batch_decode(
113
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
114
+ )[0]
115
+ return response
116
+
117
+ def _inference_with_vllm(self, image, prompt):
118
+ response = inference_with_vllm(
119
+ image,
120
+ prompt,
121
+ model_name=self.model_name,
122
+ ip=self.ip,
123
+ port=self.port,
124
+ temperature=self.temperature,
125
+ top_p=self.top_p,
126
+ max_completion_tokens=self.max_completion_tokens,
127
+ )
128
+ return response
129
+
130
+ def get_prompt(self, prompt_mode, bbox=None, origin_image=None, image=None, min_pixels=None, max_pixels=None):
131
+ prompt = dict_promptmode_to_prompt[prompt_mode]
132
+ if prompt_mode == 'prompt_grounding_ocr':
133
+ assert bbox is not None
134
+ bboxes = [bbox]
135
+ bbox = pre_process_bboxes(origin_image, bboxes, input_width=image.width, input_height=image.height, min_pixels=min_pixels, max_pixels=max_pixels)[0]
136
+ prompt = prompt + str(bbox)
137
+ return prompt
138
+
139
+ # def post_process_results(self, response, prompt_mode, save_dir, save_name, origin_image, image, min_pixels, max_pixels)
140
+ def _parse_single_image(
141
+ self,
142
+ origin_image,
143
+ prompt_mode,
144
+ save_dir,
145
+ save_name,
146
+ source="image",
147
+ page_idx=0,
148
+ bbox=None,
149
+ fitz_preprocess=False,
150
+ ):
151
+ min_pixels, max_pixels = self.min_pixels, self.max_pixels
152
+ if prompt_mode == "prompt_grounding_ocr":
153
+ min_pixels = min_pixels or MIN_PIXELS # preprocess image to the final input
154
+ max_pixels = max_pixels or MAX_PIXELS
155
+ if min_pixels is not None: assert min_pixels >= MIN_PIXELS, f"min_pixels should be >= {MIN_PIXELS}"
156
+ if max_pixels is not None: assert max_pixels <= MAX_PIXELS, f"max_pixels should be <= {MAX_PIXELS}"
157
+
158
+ if source == 'image' and fitz_preprocess:
159
+ image = get_image_by_fitz_doc(origin_image, target_dpi=self.dpi)
160
+ image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
161
+ else:
162
+ image = fetch_image(origin_image, min_pixels=min_pixels, max_pixels=max_pixels)
163
+ input_height, input_width = smart_resize(image.height, image.width)
164
+ prompt = self.get_prompt(prompt_mode, bbox, origin_image, image, min_pixels=min_pixels, max_pixels=max_pixels)
165
+ if self.use_hf:
166
+ response = self._inference_with_hf(image, prompt)
167
+ else:
168
+ response = self._inference_with_vllm(image, prompt)
169
+ result = {'page_no': page_idx,
170
+ "input_height": input_height,
171
+ "input_width": input_width
172
+ }
173
+ if source == 'pdf':
174
+ save_name = f"{save_name}_page_{page_idx}"
175
+ if prompt_mode in ['prompt_layout_all_en', 'prompt_layout_only_en', 'prompt_grounding_ocr']:
176
+ cells, filtered = post_process_output(
177
+ response,
178
+ prompt_mode,
179
+ origin_image,
180
+ image,
181
+ min_pixels=min_pixels,
182
+ max_pixels=max_pixels,
183
+ )
184
+ if filtered and prompt_mode != 'prompt_layout_only_en':  # the model's JSON output failed to parse; fall back to the filtered plain output
185
+ json_file_path = os.path.join(save_dir, f"{save_name}.json")
186
+ with open(json_file_path, 'w', encoding="utf-8") as w:
187
+ json.dump(response, w, ensure_ascii=False)
188
+
189
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
190
+ origin_image.save(image_layout_path)
191
+ result.update({
192
+ 'layout_info_path': json_file_path,
193
+ 'layout_image_path': image_layout_path,
194
+ })
195
+
196
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
197
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
198
+ md_file.write(cells)
199
+ result.update({
200
+ 'md_content_path': md_file_path
201
+ })
202
+ result.update({
203
+ 'filtered': True
204
+ })
205
+ else:
206
+ try:
207
+ image_with_layout = draw_layout_on_image(origin_image, cells)
208
+ except Exception as e:
209
+ print(f"Error drawing layout on image: {e}")
210
+ image_with_layout = origin_image
211
+
212
+ json_file_path = os.path.join(save_dir, f"{save_name}.json")
213
+ with open(json_file_path, 'w', encoding="utf-8") as w:
214
+ json.dump(cells, w, ensure_ascii=False)
215
+
216
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
217
+ image_with_layout.save(image_layout_path)
218
+ result.update({
219
+ 'layout_info_path': json_file_path,
220
+ 'layout_image_path': image_layout_path,
221
+ })
222
+ if prompt_mode != "prompt_layout_only_en": # no text md when detection only
223
+ md_content = layoutjson2md(origin_image, cells, text_key='text')
224
+ md_content_no_hf = layoutjson2md(origin_image, cells, text_key='text', no_page_hf=True)  # clean output without page headers/footers, used for omnidocbench / olmbench metrics
225
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
226
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
227
+ md_file.write(md_content)
228
+ md_nohf_file_path = os.path.join(save_dir, f"{save_name}_nohf.md")
229
+ with open(md_nohf_file_path, "w", encoding="utf-8") as md_file:
230
+ md_file.write(md_content_no_hf)
231
+ result.update({
232
+ 'md_content_path': md_file_path,
233
+ 'md_content_nohf_path': md_nohf_file_path,
234
+ })
235
+ else:
236
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
237
+ origin_image.save(image_layout_path)
238
+ result.update({
239
+ 'layout_image_path': image_layout_path,
240
+ })
241
+
242
+ md_content = response
243
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
244
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
245
+ md_file.write(md_content)
246
+ result.update({
247
+ 'md_content_path': md_file_path,
248
+ })
249
+
250
+ return result
251
+
252
+ def parse_image(self, input_path, filename, prompt_mode, save_dir, bbox=None, fitz_preprocess=False):
253
+ origin_image = fetch_image(input_path)
254
+ result = self._parse_single_image(origin_image, prompt_mode, save_dir, filename, source="image", bbox=bbox, fitz_preprocess=fitz_preprocess)
255
+ result['file_path'] = input_path
256
+ return [result]
257
+
258
+ def parse_pdf(self, input_path, filename, prompt_mode, save_dir):
259
+ print(f"loading pdf: {input_path}")
260
+ images_origin = load_images_from_pdf(input_path, dpi=self.dpi)
261
+ total_pages = len(images_origin)
262
+ tasks = [
263
+ {
264
+ "origin_image": image,
265
+ "prompt_mode": prompt_mode,
266
+ "save_dir": save_dir,
267
+ "save_name": filename,
268
+ "source":"pdf",
269
+ "page_idx": i,
270
+ } for i, image in enumerate(images_origin)
271
+ ]
272
+
273
+ def _execute_task(task_args):
274
+ return self._parse_single_image(**task_args)
275
+
276
+ if self.use_hf:
277
+ num_thread = 1
278
+ else:
279
+ num_thread = min(total_pages, self.num_thread)
280
+ print(f"Parsing PDF with {total_pages} pages using {num_thread} threads...")
281
+
282
+ results = []
283
+ with ThreadPool(num_thread) as pool:
284
+ with tqdm(total=total_pages, desc="Processing PDF pages") as pbar:
285
+ for result in pool.imap_unordered(_execute_task, tasks):
286
+ results.append(result)
287
+ pbar.update(1)
288
+
289
+ results.sort(key=lambda x: x["page_no"])
290
+ for i in range(len(results)):
291
+ results[i]['file_path'] = input_path
292
+ return results
293
+
294
+ def parse_file(self,
295
+ input_path,
296
+ output_dir="",
297
+ prompt_mode="prompt_layout_all_en",
298
+ bbox=None,
299
+ fitz_preprocess=False
300
+ ):
301
+ output_dir = output_dir or self.output_dir
302
+ output_dir = os.path.abspath(output_dir)
303
+ filename, file_ext = os.path.splitext(os.path.basename(input_path))
304
+ save_dir = os.path.join(output_dir, filename)
305
+ os.makedirs(save_dir, exist_ok=True)
306
+
307
+ if file_ext == '.pdf':
308
+ results = self.parse_pdf(input_path, filename, prompt_mode, save_dir)
309
+ elif file_ext in image_extensions:
310
+ results = self.parse_image(input_path, filename, prompt_mode, save_dir, bbox=bbox, fitz_preprocess=fitz_preprocess)
311
+ else:
312
+ raise ValueError(f"file extension {file_ext} not supported, supported extensions are {image_extensions} and pdf")
313
+
314
+ print(f"Parsing finished, results saving to {save_dir}")
315
+ with open(os.path.join(output_dir, os.path.basename(filename)+'.jsonl'), 'w', encoding="utf-8") as w:
316
+ for result in results:
317
+ w.write(json.dumps(result, ensure_ascii=False) + '\n')
318
+
319
+ return results
320
+
321
+
322
+
323
+ def main():
324
+ prompts = list(dict_promptmode_to_prompt.keys())
325
+ parser = argparse.ArgumentParser(
326
+ description="dots.ocr Multilingual Document Layout Parser",
327
+ )
328
+
329
+ parser.add_argument(
330
+ "input_path", type=str,
331
+ help="Input PDF/image file path"
332
+ )
333
+
334
+ parser.add_argument(
335
+ "--output", type=str, default="./output",
336
+ help="Output directory (default: ./output)"
337
+ )
338
+
339
+ parser.add_argument(
340
+ "--prompt", choices=prompts, type=str, default="prompt_layout_all_en",
341
+ help="prompt to query the model, different prompts for different tasks"
342
+ )
343
+ parser.add_argument(
344
+ '--bbox',
345
+ type=int,
346
+ nargs=4,
347
+ metavar=('x1', 'y1', 'x2', 'y2'),
348
+ help='bounding box (x1 y1 x2 y2); required when using --prompt prompt_grounding_ocr'
349
+ )
350
+ parser.add_argument(
351
+ "--ip", type=str, default="localhost",
352
+ help=""
353
+ )
354
+ parser.add_argument(
355
+ "--port", type=int, default=8000,
356
+ help=""
357
+ )
358
+ parser.add_argument(
359
+ "--model_name", type=str, default="model",
360
+ help=""
361
+ )
362
+ parser.add_argument(
363
+ "--temperature", type=float, default=0.1,
364
+ help=""
365
+ )
366
+ parser.add_argument(
367
+ "--top_p", type=float, default=1.0,
368
+ help=""
369
+ )
370
+ parser.add_argument(
371
+ "--dpi", type=int, default=200,
372
+ help=""
373
+ )
374
+ parser.add_argument(
375
+ "--max_completion_tokens", type=int, default=16384,
376
+ help=""
377
+ )
378
+ parser.add_argument(
379
+ "--num_thread", type=int, default=16,
380
+ help=""
381
+ )
382
+ parser.add_argument(
383
+ "--no_fitz_preprocess", action='store_true',
384
+ help="False will use tikz dpi upsample pipeline, good for images which has been render with low dpi, but maybe result in higher computational costs"
385
+ )
386
+ parser.add_argument(
387
+ "--min_pixels", type=int, default=None,
388
+ help=""
389
+ )
390
+ parser.add_argument(
391
+ "--max_pixels", type=int, default=None,
392
+ help=""
393
+ )
394
+ parser.add_argument(
395
+ "--use_hf", type=bool, default=False,
396
+ help=""
397
+ )
398
+ args = parser.parse_args()
399
+
400
+ dots_ocr_parser = DotsOCRParser(
401
+ ip=args.ip,
402
+ port=args.port,
403
+ model_name=args.model_name,
404
+ temperature=args.temperature,
405
+ top_p=args.top_p,
406
+ max_completion_tokens=args.max_completion_tokens,
407
+ num_thread=args.num_thread,
408
+ dpi=args.dpi,
409
+ output_dir=args.output,
410
+ min_pixels=args.min_pixels,
411
+ max_pixels=args.max_pixels,
412
+ use_hf=args.use_hf,
413
+ )
414
+
415
+ fitz_preprocess = not args.no_fitz_preprocess
416
+ if fitz_preprocess:
417
+ print(f"Using fitz preprocess for image input, check the change of the image pixels")
418
+ result = dots_ocr_parser.parse_file(
419
+ args.input_path,
420
+ prompt_mode=args.prompt,
421
+ bbox=args.bbox,
422
+ fitz_preprocess=fitz_preprocess,
423
+ )
424
+
425
+
426
+
427
+ if __name__ == "__main__":
428
+ main()
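A minimal end-to-end sketch for `DotsOCRParser` (same assumption of a running vLLM server as above; `demo/demo_pdf1.pdf` ships with this repo):

```python
# Sketch: assumes a vLLM server at localhost:8000 serving the dots.ocr weights.
from dots_ocr import DotsOCRParser

ocr_parser = DotsOCRParser(ip="localhost", port=8000, output_dir="./output")
results = ocr_parser.parse_file(
    "demo/demo_pdf1.pdf",
    prompt_mode="prompt_layout_all_en",
)
for page in results:  # one result dict per PDF page, sorted by page_no
    print(page["page_no"], page.get("md_content_path"))
```

From the repo root, something like `python3 -m dots_ocr.parser demo/demo_pdf1.pdf --output ./output` should be the equivalent CLI call.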
dots_ocr/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .prompts import dict_promptmode_to_prompt