redhairedshanks1 committed
Commit b56e481 · verified · 1 Parent(s): 3a12210

Upload 61 files

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +3 -35
  2. .gitignore +123 -0
  3. LICENSE +21 -0
  4. NOTICE +0 -0
  5. README.md +1228 -12
  6. assets/blog.md +1044 -0
  7. assets/chart.png +3 -0
  8. assets/logo.png +3 -0
  9. assets/showcase/Tibetan.png +3 -0
  10. assets/showcase/formula1.png +3 -0
  11. assets/showcase/formula2.png +3 -0
  12. assets/showcase/formula3.png +3 -0
  13. assets/showcase/grounding.png +3 -0
  14. assets/showcase/kannada.png +3 -0
  15. assets/showcase/nl.png +3 -0
  16. assets/showcase/reading_order.png +3 -0
  17. assets/showcase/russian.png +3 -0
  18. assets/showcase/table1.png +3 -0
  19. assets/showcase/table2.png +3 -0
  20. assets/showcase/table3.png +3 -0
  21. assets/showcase/tradition_zh.png +3 -0
  22. assets/showcase_origin/Tibetan.png +3 -0
  23. assets/showcase_origin/formula_1.jpg +3 -0
  24. assets/showcase_origin/formula_2.jpg +3 -0
  25. assets/showcase_origin/formula_3.jpg +3 -0
  26. assets/showcase_origin/kannada.jpg +3 -0
  27. assets/showcase_origin/nl.png +3 -0
  28. assets/showcase_origin/reading_order.png +3 -0
  29. assets/showcase_origin/russian.png +3 -0
  30. assets/showcase_origin/table_1.jpg +3 -0
  31. assets/showcase_origin/table_2.jpg +3 -0
  32. assets/showcase_origin/table_3.jpg +3 -0
  33. assets/showcase_origin/tradition_zh.png +3 -0
  34. assets/wechat.png +3 -0
  35. demo/demo_colab_remote_server.ipynb +0 -0
  36. demo/demo_gradio.py +726 -0
  37. demo/demo_gradio_annotion.py +666 -0
  38. demo/demo_hf.py +71 -0
  39. demo/demo_image1.jpg +3 -0
  40. demo/demo_pdf1.pdf +3 -0
  41. demo/demo_streamlit.py +222 -0
  42. demo/demo_vllm.py +42 -0
  43. demo/launch_model_vllm.sh +17 -0
  44. docker/Dockerfile +4 -0
  45. docker/docker-compose.yml +44 -0
  46. dots.ocr LICENSE AGREEMENT +109 -0
  47. dots_ocr/__init__.py +1 -0
  48. dots_ocr/model/inference.py +50 -0
  49. dots_ocr/parser.py +428 -0
  50. dots_ocr/utils/__init__.py +1 -0
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.jpg filter=lfs diff=lfs merge=lfs -text
3
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,123 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ weights/
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ .hypothesis/
46
+ .pytest_cache/
47
+
48
+ # Translations
49
+ *.mo
50
+ *.pot
51
+
52
+ # Django stuff:
53
+ *.log
54
+ local_settings.py
55
+ db.sqlite3
56
+
57
+ # Flask stuff:
58
+ instance/
59
+ .webassets-cache
60
+
61
+ # Scrapy stuff:
62
+ .scrapy
63
+
64
+ # Sphinx documentation
65
+ docs/_build/
66
+
67
+ # PyBuilder
68
+ target/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints
72
+
73
+ # pyenv
74
+ .python-version
75
+
76
+ # celery beat schedule file
77
+ celerybeat-schedule
78
+
79
+ # SageMath parsed files
80
+ *.sage.py
81
+
82
+ # Environments
83
+ .env
84
+ .venv
85
+ env/
86
+ venv/
87
+ ENV/
88
+ env.bak/
89
+ venv.bak/
90
+
91
+ # Spyder project settings
92
+ .spyderproject
93
+ .spyproject
94
+
95
+ # Rope project settings
96
+ .ropeproject
97
+
98
+ # mkdocs documentation
99
+ /site
100
+
101
+ # mypy
102
+ .mypy_cache/
103
+ .dmypy.json
104
+ dmypy.json
105
+
106
+ # IDEs
107
+ .vscode/
108
+ .idea/
109
+ *.swp
110
+ *.swo
111
+ *~
112
+
113
+ # MacOS
114
+ .DS_Store
115
+
116
+ # OCR related
117
+ #*.jpg
118
+ # *.jpeg
119
+ #*.png
120
+ #*.pdf
121
+ temp/
122
+ output/
123
+ # playground/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 rednote-hilab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
NOTICE ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1,12 +1,1228 @@
1
- ---
2
- title: Dots Ocr
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ <div align="center">
2
+
3
+ <p align="center">
4
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/logo.png" width="300"/>
5
+ </p>
6
+
7
+ <h1 align="center">
8
+ dots.ocr: Multilingual Document Layout Parsing in a Single Vision-Language Model
9
+ </h1>
10
+
11
+ [![Blog](https://img.shields.io/badge/Blog-View_on_GitHub-333.svg?logo=github)](https://github.com/rednote-hilab/dots.ocr/blob/master/assets/blog.md)
12
+ [![HuggingFace](https://img.shields.io/badge/HuggingFace%20Weights-black.svg?logo=HuggingFace)](https://huggingface.co/rednote-hilab/dots.ocr)
13
+
14
+
15
+ <div align="center">
16
+ <a href="https://dotsocr.xiaohongshu.com" target="_blank" rel="noopener noreferrer"><strong>🖥️ Live Demo</strong></a> |
17
+ <a href="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/wechat.png" target="_blank" rel="noopener noreferrer"><strong>💬 WeChat</strong></a> |
18
+ <a href="https://www.xiaohongshu.com/user/profile/683ffe42000000001d021a4c" target="_blank" rel="noopener noreferrer"><strong>📕 rednote</strong></a> |
19
+ <a href="https://x.com/rednotehilab" target="_blank" rel="noopener noreferrer"><strong>🐦 X</strong></a>
20
+ </div>
21
+
22
+ </div>
23
+
24
+
25
+
26
+ ## Introduction
27
+
28
+ **dots.ocr** is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance.
29
+
30
+ 1. **Powerful Performance:** **dots.ocr** achieves SOTA performance for text, tables, and reading order on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), while delivering formula recognition results comparable to much larger models such as Doubao-1.5 and Gemini-2.5-Pro.
31
+ 2. **Multilingual Support:** **dots.ocr** demonstrates robust parsing capabilities for low-resource languages, achieving decisive advantages across both layout detection and content recognition on our in-house multilingual documents benchmark.
32
+ 3. **Unified and Simple Architecture:** By leveraging a single vision-language model, **dots.ocr** offers a significantly more streamlined architecture than conventional methods that rely on complex, multi-model pipelines. Switching between tasks is accomplished simply by altering the input prompt, proving that a VLM can achieve competitive detection results compared to traditional detection models like DocLayout-YOLO.
33
+ 4. **Efficient and Fast Performance:** Built upon a compact 1.7B LLM, **dots.ocr** provides faster inference speeds than many other high-performing models based on larger foundations.
34
+
35
+
36
+ ### Performance Comparison: dots.ocr vs. Competing Models
37
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/chart.png" border="0" />
38
+
39
+ > **Notes:**
40
+ > - The EN and ZH metrics are the end-to-end evaluation results on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and the Multilingual metric is the end-to-end evaluation result on dots.ocr-bench.
41
+
42
+
43
+ ## News
44
+ * ```2025.07.30 ``` 🚀 We release [dots.ocr](https://github.com/rednote-hilab/dots.ocr) — a multilingual document parsing model built on a 1.7B LLM, with SOTA performance.
45
+
46
+
47
+
48
+ ## Benchmark Results
49
+
50
+ ### 1. OmniDocBench
51
+
52
+ #### The end-to-end evaluation results of different tasks.
53
+
54
+ <table>
55
+ <thead>
56
+ <tr>
57
+ <th rowspan="2"><strong>Model<br>Type</strong></th>
58
+ <th rowspan="2"><strong>Methods</strong></th>
59
+ <th colspan="2"><strong>Overall<sup>Edit</sup>↓</strong></th>
60
+ <th colspan="2"><strong>Text<sup>Edit</sup>↓</strong></th>
61
+ <th colspan="2"><strong>Formula<sup>Edit</sup>↓</strong></th>
62
+ <th colspan="2"><strong>Table<sup>TEDS</sup>↑</strong></th>
63
+ <th colspan="2"><strong>Table<sup>Edit</sup>↓</strong></th>
64
+ <th colspan="2"><strong>Read Order<sup>Edit</sup>↓</strong></th>
65
+ </tr>
66
+ <tr>
67
+ <th><em>EN</em></th>
68
+ <th><em>ZH</em></th>
69
+ <th><em>EN</em></th>
70
+ <th><em>ZH</em></th>
71
+ <th><em>EN</em></th>
72
+ <th><em>ZH</em></th>
73
+ <th><em>EN</em></th>
74
+ <th><em>ZH</em></th>
75
+ <th><em>EN</em></th>
76
+ <th><em>ZH</em></th>
77
+ <th><em>EN</em></th>
78
+ <th><em>ZH</em></th>
79
+ </tr>
80
+ </thead>
81
+ <tbody>
82
+ <tr>
83
+ <td rowspan="8"><strong>Pipeline<br>Tools</strong></td>
84
+ <td>MinerU</td>
85
+ <td>0.150</td>
86
+ <td>0.357</td>
87
+ <td>0.061</td>
88
+ <td>0.215</td>
89
+ <td>0.278</td>
90
+ <td>0.577</td>
91
+ <td>78.6</td>
92
+ <td>62.1</td>
93
+ <td>0.180</td>
94
+ <td>0.344</td>
95
+ <td>0.079</td>
96
+ <td>0.292</td>
97
+ </tr>
98
+ <tr>
99
+ <td>Marker</td>
100
+ <td>0.336</td>
101
+ <td>0.556</td>
102
+ <td>0.080</td>
103
+ <td>0.315</td>
104
+ <td>0.530</td>
105
+ <td>0.883</td>
106
+ <td>67.6</td>
107
+ <td>49.2</td>
108
+ <td>0.619</td>
109
+ <td>0.685</td>
110
+ <td>0.114</td>
111
+ <td>0.340</td>
112
+ </tr>
113
+ <tr>
114
+ <td>Mathpix</td>
115
+ <td>0.191</td>
116
+ <td>0.365</td>
117
+ <td>0.105</td>
118
+ <td>0.384</td>
119
+ <td>0.306</td>
120
+ <td>0.454</td>
121
+ <td>77.0</td>
122
+ <td>67.1</td>
123
+ <td>0.243</td>
124
+ <td>0.320</td>
125
+ <td>0.108</td>
126
+ <td>0.304</td>
127
+ </tr>
128
+ <tr>
129
+ <td>Docling</td>
130
+ <td>0.589</td>
131
+ <td>0.909</td>
132
+ <td>0.416</td>
133
+ <td>0.987</td>
134
+ <td>0.999</td>
135
+ <td>1</td>
136
+ <td>61.3</td>
137
+ <td>25.0</td>
138
+ <td>0.627</td>
139
+ <td>0.810</td>
140
+ <td>0.313</td>
141
+ <td>0.837</td>
142
+ </tr>
143
+ <tr>
144
+ <td>Pix2Text</td>
145
+ <td>0.320</td>
146
+ <td>0.528</td>
147
+ <td>0.138</td>
148
+ <td>0.356</td>
149
+ <td>0.276</td>
150
+ <td>0.611</td>
151
+ <td>73.6</td>
152
+ <td>66.2</td>
153
+ <td>0.584</td>
154
+ <td>0.645</td>
155
+ <td>0.281</td>
156
+ <td>0.499</td>
157
+ </tr>
158
+ <tr>
159
+ <td>Unstructured</td>
160
+ <td>0.586</td>
161
+ <td>0.716</td>
162
+ <td>0.198</td>
163
+ <td>0.481</td>
164
+ <td>0.999</td>
165
+ <td>1</td>
166
+ <td>0</td>
167
+ <td>0.06</td>
168
+ <td>1</td>
169
+ <td>0.998</td>
170
+ <td>0.145</td>
171
+ <td>0.387</td>
172
+ </tr>
173
+ <tr>
174
+ <td>OpenParse</td>
175
+ <td>0.646</td>
176
+ <td>0.814</td>
177
+ <td>0.681</td>
178
+ <td>0.974</td>
179
+ <td>0.996</td>
180
+ <td>1</td>
181
+ <td>64.8</td>
182
+ <td>27.5</td>
183
+ <td>0.284</td>
184
+ <td>0.639</td>
185
+ <td>0.595</td>
186
+ <td>0.641</td>
187
+ </tr>
188
+ <tr>
189
+ <td>PPStruct-V3</td>
190
+ <td>0.145</td>
191
+ <td>0.206</td>
192
+ <td>0.058</td>
193
+ <td>0.088</td>
194
+ <td>0.295</td>
195
+ <td>0.535</td>
196
+ <td>-</td>
197
+ <td>-</td>
198
+ <td>0.159</td>
199
+ <td>0.109</td>
200
+ <td>0.069</td>
201
+ <td>0.091</td>
202
+ </tr>
203
+ <tr>
204
+ <td rowspan="9"><strong>Expert<br>VLMs</strong></td>
205
+ <td>GOT-OCR</td>
206
+ <td>0.287</td>
207
+ <td>0.411</td>
208
+ <td>0.189</td>
209
+ <td>0.315</td>
210
+ <td>0.360</td>
211
+ <td>0.528</td>
212
+ <td>53.2</td>
213
+ <td>47.2</td>
214
+ <td>0.459</td>
215
+ <td>0.520</td>
216
+ <td>0.141</td>
217
+ <td>0.280</td>
218
+ </tr>
219
+ <tr>
220
+ <td>Nougat</td>
221
+ <td>0.452</td>
222
+ <td>0.973</td>
223
+ <td>0.365</td>
224
+ <td>0.998</td>
225
+ <td>0.488</td>
226
+ <td>0.941</td>
227
+ <td>39.9</td>
228
+ <td>0</td>
229
+ <td>0.572</td>
230
+ <td>1.000</td>
231
+ <td>0.382</td>
232
+ <td>0.954</td>
233
+ </tr>
234
+ <tr>
235
+ <td>Mistral OCR</td>
236
+ <td>0.268</td>
237
+ <td>0.439</td>
238
+ <td>0.072</td>
239
+ <td>0.325</td>
240
+ <td>0.318</td>
241
+ <td>0.495</td>
242
+ <td>75.8</td>
243
+ <td>63.6</td>
244
+ <td>0.600</td>
245
+ <td>0.650</td>
246
+ <td>0.083</td>
247
+ <td>0.284</td>
248
+ </tr>
249
+ <tr>
250
+ <td>OLMOCR-sglang</td>
251
+ <td>0.326</td>
252
+ <td>0.469</td>
253
+ <td>0.097</td>
254
+ <td>0.293</td>
255
+ <td>0.455</td>
256
+ <td>0.655</td>
257
+ <td>68.1</td>
258
+ <td>61.3</td>
259
+ <td>0.608</td>
260
+ <td>0.652</td>
261
+ <td>0.145</td>
262
+ <td>0.277</td>
263
+ </tr>
264
+ <tr>
265
+ <td>SmolDocling-256M</td>
266
+ <td>0.493</td>
267
+ <td>0.816</td>
268
+ <td>0.262</td>
269
+ <td>0.838</td>
270
+ <td>0.753</td>
271
+ <td>0.997</td>
272
+ <td>44.9</td>
273
+ <td>16.5</td>
274
+ <td>0.729</td>
275
+ <td>0.907</td>
276
+ <td>0.227</td>
277
+ <td>0.522</td>
278
+ </tr>
279
+ <tr>
280
+ <td>Dolphin</td>
281
+ <td>0.206</td>
282
+ <td>0.306</td>
283
+ <td>0.107</td>
284
+ <td>0.197</td>
285
+ <td>0.447</td>
286
+ <td>0.580</td>
287
+ <td>77.3</td>
288
+ <td>67.2</td>
289
+ <td>0.180</td>
290
+ <td>0.285</td>
291
+ <td>0.091</td>
292
+ <td>0.162</td>
293
+ </tr>
294
+ <tr>
295
+ <td>MinerU 2</td>
296
+ <td>0.139</td>
297
+ <td>0.240</td>
298
+ <td>0.047</td>
299
+ <td>0.109</td>
300
+ <td>0.297</td>
301
+ <td>0.536</td>
302
+ <td>82.5</td>
303
+ <td>79.0</td>
304
+ <td>0.141</td>
305
+ <td>0.195</td>
306
+ <td>0.069</td>
307
+ <td>0.118</td>
308
+ </tr>
309
+ <tr>
310
+ <td>OCRFlux</td>
311
+ <td>0.195</td>
312
+ <td>0.281</td>
313
+ <td>0.064</td>
314
+ <td>0.183</td>
315
+ <td>0.379</td>
316
+ <td>0.613</td>
317
+ <td>71.6</td>
318
+ <td>81.3</td>
319
+ <td>0.253</td>
320
+ <td>0.139</td>
321
+ <td>0.086</td>
322
+ <td>0.187</td>
323
+ </tr>
324
+ <tr>
325
+ <td>MonkeyOCR-pro-3B</td>
326
+ <td>0.138</td>
327
+ <td>0.206</td>
328
+ <td>0.067</td>
329
+ <td>0.107</td>
330
+ <td><strong>0.246</strong></td>
331
+ <td>0.421</td>
332
+ <td>81.5</td>
333
+ <td>87.5</td>
334
+ <td>0.139</td>
335
+ <td>0.111</td>
336
+ <td>0.100</td>
337
+ <td>0.185</td>
338
+ </tr>
339
+ <tr>
340
+
341
+ <td rowspan="5"><strong>General<br>VLMs</strong></td>
342
+ <td>GPT4o</td>
343
+ <td>0.233</td>
344
+ <td>0.399</td>
345
+ <td>0.144</td>
346
+ <td>0.409</td>
347
+ <td>0.425</td>
348
+ <td>0.606</td>
349
+ <td>72.0</td>
350
+ <td>62.9</td>
351
+ <td>0.234</td>
352
+ <td>0.329</td>
353
+ <td>0.128</td>
354
+ <td>0.251</td>
355
+ </tr>
356
+ <tr>
357
+ <td>Qwen2-VL-72B</td>
358
+ <td>0.252</td>
359
+ <td>0.327</td>
360
+ <td>0.096</td>
361
+ <td>0.218</td>
362
+ <td>0.404</td>
363
+ <td>0.487</td>
364
+ <td>76.8</td>
365
+ <td>76.4</td>
366
+ <td>0.387</td>
367
+ <td>0.408</td>
368
+ <td>0.119</td>
369
+ <td>0.193</td>
370
+ </tr>
371
+ <tr>
372
+ <td>Qwen2.5-VL-72B</td>
373
+ <td>0.214</td>
374
+ <td>0.261</td>
375
+ <td>0.092</td>
376
+ <td>0.18</td>
377
+ <td>0.315</td>
378
+ <td>0.434</td>
379
+ <td>82.9</td>
380
+ <td>83.9</td>
381
+ <td>0.341</td>
382
+ <td>0.262</td>
383
+ <td>0.106</td>
384
+ <td>0.168</td>
385
+ </tr>
386
+ <tr>
387
+ <td>Gemini2.5-Pro</td>
388
+ <td>0.148</td>
389
+ <td>0.212</td>
390
+ <td>0.055</td>
391
+ <td>0.168</td>
392
+ <td>0.356</td>
393
+ <td>0.439</td>
394
+ <td>85.8</td>
395
+ <td>86.4</td>
396
+ <td>0.13</td>
397
+ <td>0.119</td>
398
+ <td>0.049</td>
399
+ <td>0.121</td>
400
+ </tr>
401
+ <tr>
402
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
403
+ <td>0.140</td>
404
+ <td>0.162</td>
405
+ <td>0.043</td>
406
+ <td>0.085</td>
407
+ <td>0.295</td>
408
+ <td><strong>0.384</strong></td>
409
+ <td>83.3</td>
410
+ <td><strong>89.3</strong></td>
411
+ <td>0.165</td>
412
+ <td><strong>0.085</strong></td>
413
+ <td>0.058</td>
414
+ <td>0.094</td>
415
+ </tr>
416
+ <tr>
417
+ <td rowspan="1"><strong>Expert VLMs</strong></td>
418
+ <td><strong>dots.ocr</strong></td>
419
+ <td><strong>0.125</strong></td>
420
+ <td><strong>0.160</strong></td>
421
+ <td><strong>0.032</strong></td>
422
+ <td><strong>0.066</strong></td>
423
+ <td>0.329</td>
424
+ <td>0.416</td>
425
+ <td><strong>88.6</strong></td>
426
+ <td>89.0</td>
427
+ <td><strong>0.099</strong></td>
428
+ <td>0.092</td>
429
+ <td><strong>0.040</strong></td>
430
+ <td><strong>0.067</strong></td>
431
+ </tr>
432
+ <tr>
433
+ </tbody>
434
+ </table>
435
+
436
+
437
+ #### The end-to-end text recognition performance across 9 PDF page types.
438
+
439
+ <table>
440
+ <thead>
441
+ <tr>
442
+ <th><strong>Model<br>Type</strong></th>
443
+ <th><strong>Models</strong></th>
444
+ <th><strong>Book</strong></th>
445
+ <th><strong>Slides</strong></th>
446
+ <th><strong>Financial<br>Report</strong></th>
447
+ <th><strong>Textbook</strong></th>
448
+ <th><strong>Exam<br>Paper</strong></th>
449
+ <th><strong>Magazine</strong></th>
450
+ <th><strong>Academic<br>Papers</strong></th>
451
+ <th><strong>Notes</strong></th>
452
+ <th><strong>Newspaper</strong></th>
453
+ <th><strong>Overall</strong></th>
454
+ </tr>
455
+ </thead>
456
+ <tbody>
457
+ <tr>
458
+ <td rowspan="3"><strong>Pipeline<br>Tools</strong></td>
459
+ <td>MinerU</td>
460
+ <td>0.055</td>
461
+ <td>0.124</td>
462
+ <td><u>0.033</u></td>
463
+ <td>0.102</td>
464
+ <td>0.159</td>
465
+ <td><strong>0.072</strong></td>
466
+ <td><u>0.025</u></td>
467
+ <td>0.984</td>
468
+ <td>0.171</td>
469
+ <td>0.206</td>
470
+ </tr>
471
+ <tr>
472
+ <td>Marker</td>
473
+ <td>0.074</td>
474
+ <td>0.340</td>
475
+ <td>0.089</td>
476
+ <td>0.319</td>
477
+ <td>0.452</td>
478
+ <td>0.153</td>
479
+ <td>0.059</td>
480
+ <td>0.651</td>
481
+ <td>0.192</td>
482
+ <td>0.274</td>
483
+ </tr>
484
+ <tr>
485
+ <td>Mathpix</td>
486
+ <td>0.131</td>
487
+ <td>0.220</td>
488
+ <td>0.202</td>
489
+ <td>0.216</td>
490
+ <td>0.278</td>
491
+ <td>0.147</td>
492
+ <td>0.091</td>
493
+ <td>0.634</td>
494
+ <td>0.690</td>
495
+ <td>0.300</td>
496
+ </tr>
497
+ <tr>
498
+ <td rowspan="5"><strong>Expert<br>VLMs</strong></td>
499
+ <td>GOT-OCR</td>
500
+ <td>0.111</td>
501
+ <td>0.222</td>
502
+ <td>0.067</td>
503
+ <td>0.132</td>
504
+ <td>0.204</td>
505
+ <td>0.198</td>
506
+ <td>0.179</td>
507
+ <td>0.388</td>
508
+ <td>0.771</td>
509
+ <td>0.267</td>
510
+ </tr>
511
+ <tr>
512
+ <td>Nougat</td>
513
+ <td>0.734</td>
514
+ <td>0.958</td>
515
+ <td>1.000</td>
516
+ <td>0.820</td>
517
+ <td>0.930</td>
518
+ <td>0.830</td>
519
+ <td>0.214</td>
520
+ <td>0.991</td>
521
+ <td>0.871</td>
522
+ <td>0.806</td>
523
+ </tr>
524
+ <tr>
525
+ <td>Dolphin</td>
526
+ <td>0.091</td>
527
+ <td>0.131</td>
528
+ <td>0.057</td>
529
+ <td>0.146</td>
530
+ <td>0.231</td>
531
+ <td>0.121</td>
532
+ <td>0.074</td>
533
+ <td>0.363</td>
534
+ <td>0.307</td>
535
+ <td>0.177</td>
536
+ </tr>
537
+ <tr>
538
+ <td>OCRFlux</td>
539
+ <td>0.068</td>
540
+ <td>0.125</td>
541
+ <td>0.092</td>
542
+ <td>0.102</td>
543
+ <td>0.119</td>
544
+ <td>0.083</td>
545
+ <td>0.047</td>
546
+ <td>0.223</td>
547
+ <td>0.536</td>
548
+ <td>0.149</td>
549
+ </tr>
550
+ <tr>
551
+ <td>MonkeyOCR-pro-3B</td>
552
+ <td>0.084</td>
553
+ <td>0.129</td>
554
+ <td>0.060</td>
555
+ <td>0.090</td>
556
+ <td>0.107</td>
557
+ <td>0.073</td>
558
+ <td>0.050</td>
559
+ <td>0.171</td>
560
+ <td>0.107</td>
561
+ <td>0.100</td>
562
+ </tr>
563
+ <tr>
564
+ <td rowspan="4"><strong>General<br>VLMs</strong></td>
565
+ <td>GPT4o</td>
566
+ <td>0.157</td>
567
+ <td>0.163</td>
568
+ <td>0.348</td>
569
+ <td>0.187</td>
570
+ <td>0.281</td>
571
+ <td>0.173</td>
572
+ <td>0.146</td>
573
+ <td>0.607</td>
574
+ <td>0.751</td>
575
+ <td>0.316</td>
576
+ </tr>
577
+ <tr>
578
+ <td>Qwen2.5-VL-7B</td>
579
+ <td>0.148</td>
580
+ <td>0.053</td>
581
+ <td>0.111</td>
582
+ <td>0.137</td>
583
+ <td>0.189</td>
584
+ <td>0.117</td>
585
+ <td>0.134</td>
586
+ <td>0.204</td>
587
+ <td>0.706</td>
588
+ <td>0.205</td>
589
+ </tr>
590
+ <tr>
591
+ <td>InternVL3-8B</td>
592
+ <td>0.163</td>
593
+ <td>0.056</td>
594
+ <td>0.107</td>
595
+ <td>0.109</td>
596
+ <td>0.129</td>
597
+ <td>0.100</td>
598
+ <td>0.159</td>
599
+ <td>0.150</td>
600
+ <td>0.681</td>
601
+ <td>0.188</td>
602
+ </tr>
603
+ <tr>
604
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
605
+ <td>0.048</td>
606
+ <td>0.048</td>
607
+ <td>0.024</td>
608
+ <td><strong>0.062</strong></td>
609
+ <td>0.085</td>
610
+ <td>0.051</td>
611
+ <td>0.039</td>
612
+ <td><strong>0.096</strong></td>
613
+ <td>0.181</td>
614
+ <td>0.073</td>
615
+ </tr>
616
+ <tr>
617
+ <td rowspan="1"><strong>Expert VLMs</strong></td>
618
+ <td><strong>dots.ocr</strong></td>
619
+ <td><strong>0.031</strong></td>
620
+ <td><strong>0.047</strong></td>
621
+ <td><strong>0.011</strong></td>
622
+ <td>0.082</td>
623
+ <td><strong>0.079</strong></td>
624
+ <td><strong>0.028</strong></td>
625
+ <td><strong>0.029</strong></td>
626
+ <td>0.109</td>
627
+ <td><strong>0.056</strong></td>
628
+ <td><strong>0.055</strong></td>
629
+ </tr>
630
+
631
+ </tbody>
632
+ </table>
633
+
634
+ > **Notes:**
635
+ > - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
636
+ > - We delete the Page-header and Page-footer cells in the result markdown.
637
+ > - We use the tikz_preprocess pipeline to upsample the images to 200 DPI.
638
+
639
+
640
+ ### 2. **dots.ocr-bench**
641
+
642
+ This is an in-house benchmark containing 1493 PDF images covering 100 languages.
643
+
644
+ #### The end-to-end evaluation results of different tasks.
645
+
646
+ <table>
647
+ <thead>
648
+ <tr>
649
+ <th rowspan="1"><strong>Methods</strong></th>
650
+ <th colspan="1"><strong>Overall<sup>Edit</sup>↓</strong></th>
651
+ <th colspan="1"><strong>Text<sup>Edit</sup>↓</strong></th>
652
+ <th colspan="1"><strong>Formula<sup>Edit</sup>↓</strong></th>
653
+ <th colspan="1"><strong>Table<sup>TEDS</sup>↑</strong></th>
654
+ <th colspan="1"><strong>Table<sup>Edit</sup>↓</strong></th>
655
+ <th colspan="1"><strong>Read Order<sup>Edit</sup>↓</strong></th>
656
+ </tr>
657
+ </thead>
658
+ <tbody>
659
+ <td>MonkeyOCR-3B</td>
660
+ <td>0.483</td>
661
+ <td>0.445</td>
662
+ <td>0.627</td>
663
+ <td>50.93</td>
664
+ <td>0.452</td>
665
+ <td>0.409</td>
666
+ </tr>
667
+ <tr>
668
+ <td>doubao-1-5-thinking-vision-pro-250428</td>
669
+ <td>0.291</td>
670
+ <td>0.226</td>
671
+ <td>0.440</td>
672
+ <td>71.2</td>
673
+ <td>0.260</td>
674
+ <td>0.238</td>
675
+ </tr>
676
+ <tr>
677
+ <td>doubao-1-6</td>
678
+ <td>0.299</td>
679
+ <td>0.270</td>
680
+ <td>0.417</td>
681
+ <td>71.0</td>
682
+ <td>0.258</td>
683
+ <td>0.253</td>
684
+ </tr>
685
+ <tr>
686
+ <td>Gemini2.5-Pro</td>
687
+ <td>0.251</td>
688
+ <td>0.163</td>
689
+ <td>0.402</td>
690
+ <td>77.1</td>
691
+ <td>0.236</td>
692
+ <td>0.202</td>
693
+ </tr>
694
+ <tr>
695
+ <td><strong>dots.ocr</strong> </td>
696
+ <td><strong>0.177</strong></td>
697
+ <td><strong>0.075</strong></td>
698
+ <td><strong>0.297</strong></td>
699
+ <td><strong>79.2</strong></td>
700
+ <td><strong>0.186</strong></td>
701
+ <td><strong>0.152</strong></td>
702
+ </tr>
703
+
704
+ </tbody>
705
+ </table>
706
+
707
+ > **Notes:**
708
+ > - We use the same metric calculation pipeline as [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
709
+ > - We delete the Page-header and Page-footer cells in the result markdown.
710
+
711
+ #### Layout Detection
712
+
713
+ <table>
714
+ <thead>
715
+ <tr>
716
+ <th rowspan="2"><strong>Method</strong></th>
717
+ <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50:.05:.95↑</strong></th>
718
+ <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50↑</strong></th>
719
+ </tr>
720
+ <tr>
721
+ <th>Overall</th>
722
+ <th>Text</th>
723
+ <th>Formula</th>
724
+ <th>Table</th>
725
+ <th>Picture</th>
726
+ <th>Overall</th>
727
+ <th>Text</th>
728
+ <th>Formula</th>
729
+ <th>Table</th>
730
+ <th>Picture</th>
731
+ </tr>
732
+ </thead>
733
+
734
+ <tbody>
735
+ <td>DocLayout-YOLO-DocStructBench</td>
736
+ <td>0.733</td>
737
+ <td>0.694</td>
738
+ <td>0.480</td>
739
+ <td>0.803</td>
740
+ <td>0.619</td>
741
+ <td>0.806</td>
742
+ <td>0.779</td>
743
+ <td>0.620</td>
744
+ <td>0.858</td>
745
+ <td>0.678</td>
746
+ </tr>
747
+
748
+ <tr>
749
+ <td>dots.ocr-parse all</td>
750
+ <td>0.831</td>
751
+ <td>0.801</td>
752
+ <td>0.654</td>
753
+ <td>0.838</td>
754
+ <td>0.748</td>
755
+ <td>0.922</td>
756
+ <td>0.909</td>
757
+ <td>0.770</td>
758
+ <td>0.888</td>
759
+ <td>0.831</td>
760
+ </tr>
761
+
762
+ <tr>
763
+ <td> <strong>dots.ocr-detection only</strong> </td>
764
+ <td><strong>0.845</strong></td>
765
+ <td><strong>0.816</strong></td>
766
+ <td><strong>0.716</strong></td>
767
+ <td><strong>0.875</strong></td>
768
+ <td><strong>0.765</strong></td>
769
+ <td><strong>0.930</strong></td>
770
+ <td><strong>0.917</strong></td>
771
+ <td><strong>0.832</strong></td>
772
+ <td><strong>0.918</strong></td>
773
+ <td><strong>0.843</strong></td>
774
+ </tr>
775
+
776
+ </tbody>
777
+ </table>
778
+
779
+ > **Notes:**
780
+ > - We use prompt_layout_all_en for **parse all** and prompt_layout_only_en for **detection only**; please refer to [prompts](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)
781
+
782
+
783
+ ### 3. olmOCR-bench
784
+
785
+ <table>
786
+ <thead>
787
+ <tr>
788
+ <th>Model</th>
789
+ <th>ArXiv</th>
790
+ <th>Old Scans<br>Math</th>
791
+ <th>Tables</th>
792
+ <th>Old Scans</th>
793
+ <th>Headers and<br>Footers</th>
794
+ <th>Multi<br>column</th>
795
+ <th>Long Tiny<br>Text</th>
796
+ <th>Base</th>
797
+ <th>Overall</th>
798
+ </tr>
799
+ </thead>
800
+ <tbody>
801
+ <tr>
802
+ <td>GOT OCR</td>
803
+ <td>52.7</td>
804
+ <td>52.0</td>
805
+ <td>0.2</td>
806
+ <td>22.1</td>
807
+ <td>93.6</td>
808
+ <td>42.0</td>
809
+ <td>29.9</td>
810
+ <td>94.0</td>
811
+ <td>48.3 ± 1.1</td>
812
+ </tr>
813
+ <tr>
814
+ <td>Marker</td>
815
+ <td>76.0</td>
816
+ <td>57.9</td>
817
+ <td>57.6</td>
818
+ <td>27.8</td>
819
+ <td>84.9</td>
820
+ <td>72.9</td>
821
+ <td>84.6</td>
822
+ <td>99.1</td>
823
+ <td>70.1 ± 1.1</td>
824
+ </tr>
825
+ <tr>
826
+ <td>MinerU</td>
827
+ <td>75.4</td>
828
+ <td>47.4</td>
829
+ <td>60.9</td>
830
+ <td>17.3</td>
831
+ <td><strong>96.6</strong></td>
832
+ <td>59.0</td>
833
+ <td>39.1</td>
834
+ <td>96.6</td>
835
+ <td>61.5 ± 1.1</td>
836
+ </tr>
837
+ <tr>
838
+ <td>Mistral OCR</td>
839
+ <td>77.2</td>
840
+ <td>67.5</td>
841
+ <td>60.6</td>
842
+ <td>29.3</td>
843
+ <td>93.6</td>
844
+ <td>71.3</td>
845
+ <td>77.1</td>
846
+ <td>99.4</td>
847
+ <td>72.0 ± 1.1</td>
848
+ </tr>
849
+ <tr>
850
+ <td>Nanonets OCR</td>
851
+ <td>67.0</td>
852
+ <td>68.6</td>
853
+ <td>77.7</td>
854
+ <td>39.5</td>
855
+ <td>40.7</td>
856
+ <td>69.9</td>
857
+ <td>53.4</td>
858
+ <td>99.3</td>
859
+ <td>64.5 ± 1.1</td>
860
+ </tr>
861
+ <tr>
862
+ <td>GPT-4o<br>(No Anchor)</td>
863
+ <td>51.5</td>
864
+ <td><strong>75.5</strong></td>
865
+ <td>69.1</td>
866
+ <td>40.9</td>
867
+ <td>94.2</td>
868
+ <td>68.9</td>
869
+ <td>54.1</td>
870
+ <td>96.7</td>
871
+ <td>68.9 ± 1.1</td>
872
+ </tr>
873
+ <tr>
874
+ <td>GPT-4o<br>(Anchored)</td>
875
+ <td>53.5</td>
876
+ <td>74.5</td>
877
+ <td>70.0</td>
878
+ <td>40.7</td>
879
+ <td>93.8</td>
880
+ <td>69.3</td>
881
+ <td>60.6</td>
882
+ <td>96.8</td>
883
+ <td>69.9 ± 1.1</td>
884
+ </tr>
885
+ <tr>
886
+ <td>Gemini Flash 2<br>(No Anchor)</td>
887
+ <td>32.1</td>
888
+ <td>56.3</td>
889
+ <td>61.4</td>
890
+ <td>27.8</td>
891
+ <td>48.0</td>
892
+ <td>58.7</td>
893
+ <td><strong>84.4</strong></td>
894
+ <td>94.0</td>
895
+ <td>57.8 ± 1.1</td>
896
+ </tr>
897
+ <tr>
898
+ <td>Gemini Flash 2<br>(Anchored)</td>
899
+ <td>54.5</td>
900
+ <td>56.1</td>
901
+ <td>72.1</td>
902
+ <td>34.2</td>
903
+ <td>64.7</td>
904
+ <td>61.5</td>
905
+ <td>71.5</td>
906
+ <td>95.6</td>
907
+ <td>63.8 ± 1.2</td>
908
+ </tr>
909
+ <tr>
910
+ <td>Qwen 2 VL<br>(No Anchor)</td>
911
+ <td>19.7</td>
912
+ <td>31.7</td>
913
+ <td>24.2</td>
914
+ <td>17.1</td>
915
+ <td>88.9</td>
916
+ <td>8.3</td>
917
+ <td>6.8</td>
918
+ <td>55.5</td>
919
+ <td>31.5 ± 0.9</td>
920
+ </tr>
921
+ <tr>
922
+ <td>Qwen 2.5 VL<br>(No Anchor)</td>
923
+ <td>63.1</td>
924
+ <td>65.7</td>
925
+ <td>67.3</td>
926
+ <td>38.6</td>
927
+ <td>73.6</td>
928
+ <td>68.3</td>
929
+ <td>49.1</td>
930
+ <td>98.3</td>
931
+ <td>65.5 ± 1.2</td>
932
+ </tr>
933
+ <tr>
934
+ <td>olmOCR v0.1.75<br>(No Anchor)</td>
935
+ <td>71.5</td>
936
+ <td>71.4</td>
937
+ <td>71.4</td>
938
+ <td><strong>42.8</strong></td>
939
+ <td>94.1</td>
940
+ <td>77.7</td>
941
+ <td>71.0</td>
942
+ <td>97.8</td>
943
+ <td>74.7 ± 1.1</td>
944
+ </tr>
945
+ <tr>
946
+ <td>olmOCR v0.1.75<br>(Anchored)</td>
947
+ <td>74.9</td>
948
+ <td>71.2</td>
949
+ <td>71.0</td>
950
+ <td>42.2</td>
951
+ <td>94.5</td>
952
+ <td>78.3</td>
953
+ <td>73.3</td>
954
+ <td>98.3</td>
955
+ <td>75.5 ± 1.0</td>
956
+ </tr>
957
+ <tr>
958
+ <td>MonkeyOCR-pro-3B</td>
959
+ <td><strong>83.8</strong></td>
960
+ <td>68.8</td>
961
+ <td>74.6</td>
962
+ <td>36.1</td>
963
+ <td>91.2</td>
964
+ <td>76.6</td>
965
+ <td>80.1</td>
966
+ <td>95.3</td>
967
+ <td>75.8 ± 1.0</td>
968
+ </tr>
969
+ <tr>
970
+ <td><strong>dots.ocr</strong></td>
971
+ <td>82.1</td>
972
+ <td>64.2</td>
973
+ <td><strong>88.3</strong></td>
974
+ <td>40.9</td>
975
+ <td>94.1</td>
976
+ <td><strong>82.4</strong></td>
977
+ <td>81.2</td>
978
+ <td><strong>99.5</strong></td>
979
+ <td><strong>79.1 ± 1.0</strong></td>
980
+ </tr>
981
+ </tbody>
982
+ </table>
983
+
984
+
985
+ > **Note:**
986
+ > - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR),
987
+ [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
988
+ > - We delete the Page-header and Page-footer cells in the result markdown.
989
+
990
+
991
+
992
+ # Quick Start
993
+ ## 1. Installation
994
+ ### Install dots.ocr
995
+ ```shell
996
+ conda create -n dots_ocr python=3.12
997
+ conda activate dots_ocr
998
+
999
+ git clone https://github.com/rednote-hilab/dots.ocr.git
1000
+ cd dots.ocr
1001
+
1002
+ # Install pytorch, see https://pytorch.org/get-started/previous-versions/ for your cuda version
1003
+ pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
1004
+ pip install -e .
1005
+ ```
1006
+
1007
+ If you have trouble with the installation, try our [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) for an easier setup, and follow these steps:
1008
+ ```shell
1009
+ git clone https://github.com/rednote-hilab/dots.ocr.git
1010
+ cd dots.ocr
1011
+ pip install -e .
1012
+ ```
1013
+
1014
+
1015
+ ### Download Model Weights
1016
+ > 💡**Note:** Please use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`) for the model save path. This is a temporary workaround pending our integration with Transformers.
1017
+ ```shell
1018
+ python3 tools/download_model.py
1019
+
1020
+ # with modelscope
1021
+ python3 tools/download_model.py --type modelscope
1022
+ ```
1023
+
1024
+
1025
+ ## 2. Deployment
1026
+ ### vLLM inference
1027
+ We highly recommend using vLLM for deployment and inference. All of our evaluation results are based on vLLM version 0.9.1.
1028
+ The [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) is based on the official vllm image. You can also follow [Dockerfile](https://github.com/rednote-hilab/dots.ocr/blob/master/docker/Dockerfile) to build the deployment environment by yourself.
1029
+
1030
+ ```shell
1031
+ # You need to register the model with vLLM first
1032
+ python3 tools/download_model.py
1033
+ export hf_model_path=./weights/DotsOCR # Path to your downloaded model weights. Please use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`) for the model save path; this is a temporary workaround pending our integration with Transformers.
1034
+ export PYTHONPATH=$(dirname "$hf_model_path"):$PYTHONPATH
1035
+ sed -i '/^from vllm\.entrypoints\.cli\.main import main$/a\
1036
+ from DotsOCR import modeling_dots_ocr_vllm' `which vllm` # If you downloaded the model weights yourself, replace `DotsOCR` with the name of the directory where you saved the model, and remember to use a directory name without periods (e.g., `DotsOCR` instead of `dots.ocr`)
1037
+
1038
+ # launch vllm server
1039
+ CUDA_VISIBLE_DEVICES=0 vllm serve ${hf_model_path} --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --chat-template-content-format string --served-model-name model --trust-remote-code
1040
+
1041
+ # If you get a ModuleNotFoundError: No module named 'DotsOCR', please check the note above on the saved model directory name.
1042
+
1043
+ # vllm api demo
1044
+ python3 ./demo/demo_vllm.py --prompt_mode prompt_layout_all_en
1045
+ ```
1046
+
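+ The server exposes an OpenAI-compatible API, so you can also query it directly, similar in spirit to `demo/demo_vllm.py`. Below is a minimal sketch; it assumes the `openai` Python package is installed, the server runs at the default `http://localhost:8000`, and the served model name is `model` (as set by `--served-model-name` above). The prompt text is abbreviated — use a full prompt from `dots_ocr/utils/prompts.py`.
+ 
+ ```python
+ import base64
+ from openai import OpenAI
+ 
+ # vLLM serves an OpenAI-compatible endpoint; the API key is unused but required by the client.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ 
+ # Encode the demo image as a base64 data URL.
+ with open("demo/demo_image1.jpg", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode()
+ 
+ # Use a task prompt, e.g. the prompt_layout_all_en text from dots_ocr/utils/prompts.py.
+ prompt = "Please output the layout information from the PDF image..."  # abbreviated
+ 
+ response = client.chat.completions.create(
+     model="model",  # matches --served-model-name above
+     messages=[{
+         "role": "user",
+         "content": [
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
+             {"type": "text", "text": prompt},
+         ],
+     }],
+     max_tokens=8192,
+ )
+ print(response.choices[0].message.content)
+ ```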
1047
+ ### Hugging Face inference
1048
+ ```shell
1049
+ python3 demo/demo_hf.py
1050
+ ```
1051
+
1052
+ <details>
1053
+ <summary><b>Hugging Face inference details</b></summary>
1054
+
1055
+ ```python
1056
+ import torch
1057
+ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
1058
+ from qwen_vl_utils import process_vision_info
1059
+ from dots_ocr.utils import dict_promptmode_to_prompt
1060
+
1061
+ model_path = "./weights/DotsOCR"
1062
+ model = AutoModelForCausalLM.from_pretrained(
1063
+ model_path,
1064
+ attn_implementation="flash_attention_2",
1065
+ torch_dtype=torch.bfloat16,
1066
+ device_map="auto",
1067
+ trust_remote_code=True
1068
+ )
1069
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
1070
+
1071
+ image_path = "demo/demo_image1.jpg"
1072
+ prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1073
+
1074
+ 1. Bbox format: [x1, y1, x2, y2]
1075
+
1076
+ 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
1077
+
1078
+ 3. Text Extraction & Formatting Rules:
1079
+ - Picture: For the 'Picture' category, the text field should be omitted.
1080
+ - Formula: Format its text as LaTeX.
1081
+ - Table: Format its text as HTML.
1082
+ - All Others (Text, Title, etc.): Format their text as Markdown.
1083
+
1084
+ 4. Constraints:
1085
+ - The output text must be the original text from the image, with no translation.
1086
+ - All layout elements must be sorted according to human reading order.
1087
+
1088
+ 5. Final Output: The entire output must be a single JSON object.
1089
+ """
1090
+
1091
+ messages = [
1092
+ {
1093
+ "role": "user",
1094
+ "content": [
1095
+ {
1096
+ "type": "image",
1097
+ "image": image_path
1098
+ },
1099
+ {"type": "text", "text": prompt}
1100
+ ]
1101
+ }
1102
+ ]
1103
+
1104
+ # Preparation for inference
1105
+ text = processor.apply_chat_template(
1106
+ messages,
1107
+ tokenize=False,
1108
+ add_generation_prompt=True
1109
+ )
1110
+ image_inputs, video_inputs = process_vision_info(messages)
1111
+ inputs = processor(
1112
+ text=[text],
1113
+ images=image_inputs,
1114
+ videos=video_inputs,
1115
+ padding=True,
1116
+ return_tensors="pt",
1117
+ )
1118
+
1119
+ inputs = inputs.to("cuda")
1120
+
1121
+ # Inference: Generation of the output
1122
+ generated_ids = model.generate(**inputs, max_new_tokens=24000)
1123
+ generated_ids_trimmed = [
1124
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
1125
+ ]
1126
+ output_text = processor.batch_decode(
1127
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
1128
+ )
1129
+ print(output_text)
1130
+
1131
+ ```
1132
+
1133
+ </details>
1134
+
1135
+ ### Hugging Face inference with CPU
1136
+ Please refer to [CPU inference](https://github.com/rednote-hilab/dots.ocr/issues/1#issuecomment-3148962536)
1137
+
1138
+
1139
+ ## 3. Document Parse
1140
+ **Based on vLLM server**, you can parse an image or a pdf file using the following commands:
1141
+ ```bash
1142
+
1143
+ # Parse all layout info, both detection and recognition
1144
+ # Parse a single image
1145
+ python3 dots_ocr/parser.py demo/demo_image1.jpg
1146
+ # Parse a single PDF
1147
+ python3 dots_ocr/parser.py demo/demo_pdf1.pdf --num_thread 64 # try a larger num_thread for PDFs with many pages
1148
+
1149
+ # Layout detection only
1150
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_layout_only_en
1151
+
1152
+ # Parse text only, except Page-header and Page-footer
1153
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_ocr
1154
+
1155
+ # Parse layout info by bbox
1156
+ python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_grounding_ocr --bbox 163 241 1536 705
1157
+
1158
+ ```
1159
+ **Based on Transformers**, you can parse an image or a PDF file with the same commands as above; just add `--use_hf true`.
1160
+
1161
+ > Notice: Transformers is slower than vLLM. If you want to use demo/* with Transformers, just pass `use_hf=True` to the parser, i.e. `DotsOCRParser(..., use_hf=True)` — see the sketch below.
1162
+
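+ As a minimal sketch of the programmatic route (the method name `parse_file` and its arguments are illustrative assumptions — check `dots_ocr/parser.py` for the actual API):
+ 
+ ```python
+ from dots_ocr.parser import DotsOCRParser
+ 
+ # use_hf=True switches from the vLLM server to local Transformers inference.
+ parser = DotsOCRParser(use_hf=True)  # other constructor arguments omitted
+ 
+ # Hypothetical call; verify the real method name and signature in dots_ocr/parser.py.
+ results = parser.parse_file("demo/demo_image1.jpg", prompt_mode="prompt_layout_all_en")
+ print(results)
+ ```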
1163
+ <details>
1164
+ <summary><b>Output Results</b></summary>
1165
+
1166
+ 1. **Structured Layout Data** (`demo_image1.json`): A JSON file containing the detected layout elements, including their bounding boxes, categories, and extracted text.
1167
+ 2. **Processed Markdown File** (`demo_image1.md`): A Markdown file generated from the concatenated text of all detected cells.
1168
+ * An additional version, `demo_image1_nohf.md`, is also provided, which excludes page headers and footers for compatibility with benchmarks like OmniDocBench and olmOCR-bench.
1169
+ 3. **Layout Visualization** (`demo_image1.jpg`): The original image with the detected layout bounding boxes drawn on it.
1170
+
1171
+ </details>
1172
+
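+ As a minimal sketch of consuming the structured layout data (the output path and the `bbox`/`category`/`text` field names follow the layout format described above, but are assumptions to verify against your actual output):
+ 
+ ```python
+ import json
+ 
+ with open("output/demo_image1.json") as f:
+     cells = json.load(f)
+ 
+ # Each cell is one layout element; 'Picture' cells omit the text field.
+ for cell in cells:
+     x1, y1, x2, y2 = cell["bbox"]
+     snippet = cell.get("text", "")[:60]
+     print(f"{cell['category']} at ({x1}, {y1}, {x2}, {y2}): {snippet}")
+ ```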
1173
+ ## 4. Demo
1174
+ You can run the demo with the following command, or try it directly at the [live demo](https://dotsocr.xiaohongshu.com/):
1175
+ ```bash
1176
+ python demo/demo_gradio.py
1177
+ ```
1178
+
1179
+ We also provide a demo for grounding OCR:
1180
+ ```bash
1181
+ python demo/demo_gradio_annotion.py
1182
+ ```
1183
+
1184
+
1185
+ ### Example for formula document
1186
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula1.png" alt="formula1.png" border="0" />
1187
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula2.png" alt="formula2.png" border="0" />
1188
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula3.png" alt="formula3.png" border="0" />
1189
+
1190
+ ### Example for table document
1191
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table1.png" alt="table1.png" border="0" />
1192
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table2.png" alt="table2.png" border="0" />
1193
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table3.png" alt="table3.png" border="0" />
1194
+
1195
+ ### Example for multilingual document
1196
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/Tibetan.png" alt="Tibetan.png" border="0" />
1197
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/tradition_zh.png" alt="tradition_zh.png" border="0" />
1198
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/nl.png" alt="nl.png" border="0" />
1199
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/kannada.png" alt="kannada.png" border="0" />
1200
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/russian.png" alt="russian.png" border="0" />
1201
+
1202
+ ### Example for reading order
1203
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/reading_order.png" alt="reading_order.png" border="0" />
1204
+
1205
+ ### Example for grounding ocr
1206
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/grounding.png" alt="grounding.png" border="0" />
1207
+
1208
+
1209
+ ## Acknowledgments
1210
+ We would like to thank [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL), [aimv2](https://github.com/apple/ml-aim), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR),
1211
+ [OmniDocBench](https://github.com/opendatalab/OmniDocBench), [PyMuPDF](https://github.com/pymupdf/PyMuPDF), for providing code and models.
1212
+
1213
+ We also thank [DocLayNet](https://github.com/DS4SD/DocLayNet), [M6Doc](https://github.com/HCIILAB/M6Doc), [CDLA](https://github.com/buptlihang/CDLA), [D4LA](https://github.com/AlibabaResearch/AdvancedLiterateMachinery) for providing valuable datasets.
1214
+
1215
+ ## Limitation & Future Work
1216
+
1217
+ - **Complex Document Elements:**
1218
+ - **Table & Formula**: dots.ocr is not yet perfect at extracting high-complexity tables and formulas.
1219
+ - **Picture**: Pictures in documents are currently not parsed.
1220
+
1221
+ - **Parsing Failures:** The model may fail to parse under certain conditions:
1222
+ - When the character-to-pixel ratio is excessively high: try enlarging the image or increasing the PDF parsing DPI (a setting of 200 is recommended; see the sketch after this list). However, please note that the model performs optimally on images with a resolution under 11,289,600 pixels.
1223
+ - Continuous special characters, such as ellipses (`...`) and underscores (`_`), may cause the prediction output to repeat endlessly. In such scenarios, consider using alternative prompts like `prompt_layout_only_en`, `prompt_ocr`, or `prompt_grounding_ocr` ([details here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)).
1224
+
1225
+ - **Performance Bottleneck:** Despite its 1.7B parameter LLM foundation, **dots.ocr** is not yet optimized for high-throughput processing of large PDF volumes.
1226
+
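+ For the DPI workaround mentioned in the list above, here is a minimal sketch using [PyMuPDF](https://github.com/pymupdf/PyMuPDF) to render PDF pages at 200 DPI (the output filenames are illustrative):
+ 
+ ```python
+ import fitz  # PyMuPDF
+ 
+ doc = fitz.open("demo/demo_pdf1.pdf")
+ for i, page in enumerate(doc):
+     # Render at 200 DPI, the recommended setting for this model.
+     pix = page.get_pixmap(dpi=200)
+     pix.save(f"page_{i}.png")
+ ```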
1227
+ We are committed to achieving more accurate table and formula parsing, as well as enhancing the model's OCR capabilities for broader generalization, all while aiming for **a more powerful, more efficient model**. Furthermore, we are actively considering the development of **a more general-purpose perception model** based on Vision-Language Models (VLMs), which would integrate general detection, image captioning, and OCR tasks into a unified framework. **Parsing the content of pictures in documents** is also a key priority for our future work.
1228
+ We believe that collaboration is the key to tackling these exciting challenges. If you are passionate about advancing the frontiers of document intelligence and are interested in contributing to these future endeavors, we would love to hear from you. Please reach out to us via email at yanqing4@xiaohongshu.com.
assets/blog.md ADDED
@@ -0,0 +1,1044 @@
1
+ <h1 align="center">
2
+ dots.ocr: Multilingual Document Layout Parsing in a Single Vision-Language Model
3
+ </h1>
4
+
5
+
6
+ ## Introduction
7
+
8
+ **dots.ocr** is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance.
9
+
10
+ 1. **Powerful Performance:** **dots.ocr** achieves SOTA performance for text, tables, and reading order on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), while delivering formula recognition results comparable to much larger models such as Doubao-1.5 and Gemini-2.5-Pro.
11
+ 2. **Multilingual Support:** **dots.ocr** demonstrates robust parsing capabilities for low-resource languages, achieving decisive advantages across both layout detection and content recognition on our in-house multilingual documents benchmark.
12
+ 3. **Unified and Simple Architecture:** By leveraging a single vision-language model, **dots.ocr** offers a significantly more streamlined architecture than conventional methods that rely on complex, multi-model pipelines. Switching between tasks is accomplished simply by altering the input prompt, proving that a VLM can achieve competitive detection results compared to traditional detection models like DocLayout-YOLO.
13
+ 4. **Efficient and Fast Performance:** Built upon a compact 1.7B LLM, **dots.ocr** provides faster inference speeds than many other high-performing models based on larger foundations.
14
+
15
+
16
+ ### Performance Comparison on Document Parsing Benchmarks
17
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/chart.png" border="0" />
18
+
19
+ > **Notes:**
20
+ > - The EN and ZH metrics are the end-to-end evaluation results on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and the Multilingual metric is the end-to-end evaluation result on dots.ocr-bench.
21
+
22
+
23
+ ## Showcase
24
+ ### Example for formula document
25
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula1.png" alt="formula1.png" border="0" />
26
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula2.png" alt="formula2.png" border="0" />
27
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula3.png" alt="formula3.png" border="0" />
28
+
29
+ ### Example for table document
30
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table1.png" alt="table1.png" border="0" />
31
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table2.png" alt="table2.png" border="0" />
32
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table3.png" alt="table3.png" border="0" />
33
+
34
+ ### Example for multilingual document
35
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/Tibetan.png" alt="Tibetan.png" border="0" />
36
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/tradition_zh.png" alt="tradition_zh.png" border="0" />
37
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/nl.png" alt="nl.png" border="0" />
38
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/kannada.png" alt="kannada.png" border="0" />
39
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/russian.png" alt="russian.png" border="0" />
40
+
41
+ ### Example for reading order
42
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/reading_order.png" alt="reading_order.png" border="0" />
43
+
44
+ ### Example for grounding ocr
45
+ <img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/grounding.png" alt="grounding.png" border="0" />
46
+
47
+
48
+
49
## Benchmark Results

### 1. OmniDocBench

#### The end-to-end evaluation results of different tasks.

<table>
  <thead>
    <tr>
      <th rowspan="2"><strong>Model<br>Type</strong></th>
      <th rowspan="2"><strong>Methods</strong></th>
      <th colspan="2"><strong>Overall<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Text<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Formula<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Table<sup>TEDS</sup>↑</strong></th>
      <th colspan="2"><strong>Table<sup>Edit</sup>↓</strong></th>
      <th colspan="2"><strong>Read Order<sup>Edit</sup>↓</strong></th>
    </tr>
    <tr>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
      <th><em>EN</em></th><th><em>ZH</em></th>
    </tr>
  </thead>
  <tbody>
    <tr><td rowspan="8"><strong>Pipeline<br>Tools</strong></td><td>MinerU</td><td>0.150</td><td>0.357</td><td>0.061</td><td>0.215</td><td>0.278</td><td>0.577</td><td>78.6</td><td>62.1</td><td>0.180</td><td>0.344</td><td>0.079</td><td>0.292</td></tr>
    <tr><td>Marker</td><td>0.336</td><td>0.556</td><td>0.080</td><td>0.315</td><td>0.530</td><td>0.883</td><td>67.6</td><td>49.2</td><td>0.619</td><td>0.685</td><td>0.114</td><td>0.340</td></tr>
    <tr><td>Mathpix</td><td>0.191</td><td>0.365</td><td>0.105</td><td>0.384</td><td>0.306</td><td>0.454</td><td>77.0</td><td>67.1</td><td>0.243</td><td>0.320</td><td>0.108</td><td>0.304</td></tr>
    <tr><td>Docling</td><td>0.589</td><td>0.909</td><td>0.416</td><td>0.987</td><td>0.999</td><td>1</td><td>61.3</td><td>25.0</td><td>0.627</td><td>0.810</td><td>0.313</td><td>0.837</td></tr>
    <tr><td>Pix2Text</td><td>0.320</td><td>0.528</td><td>0.138</td><td>0.356</td><td>0.276</td><td>0.611</td><td>73.6</td><td>66.2</td><td>0.584</td><td>0.645</td><td>0.281</td><td>0.499</td></tr>
    <tr><td>Unstructured</td><td>0.586</td><td>0.716</td><td>0.198</td><td>0.481</td><td>0.999</td><td>1</td><td>0</td><td>0.06</td><td>1</td><td>0.998</td><td>0.145</td><td>0.387</td></tr>
    <tr><td>OpenParse</td><td>0.646</td><td>0.814</td><td>0.681</td><td>0.974</td><td>0.996</td><td>1</td><td>64.8</td><td>27.5</td><td>0.284</td><td>0.639</td><td>0.595</td><td>0.641</td></tr>
    <tr><td>PPStruct-V3</td><td>0.145</td><td>0.206</td><td>0.058</td><td>0.088</td><td>0.295</td><td>0.535</td><td>-</td><td>-</td><td>0.159</td><td>0.109</td><td>0.069</td><td>0.091</td></tr>
    <tr><td rowspan="9"><strong>Expert<br>VLMs</strong></td><td>GOT-OCR</td><td>0.287</td><td>0.411</td><td>0.189</td><td>0.315</td><td>0.360</td><td>0.528</td><td>53.2</td><td>47.2</td><td>0.459</td><td>0.520</td><td>0.141</td><td>0.280</td></tr>
    <tr><td>Nougat</td><td>0.452</td><td>0.973</td><td>0.365</td><td>0.998</td><td>0.488</td><td>0.941</td><td>39.9</td><td>0</td><td>0.572</td><td>1.000</td><td>0.382</td><td>0.954</td></tr>
    <tr><td>Mistral OCR</td><td>0.268</td><td>0.439</td><td>0.072</td><td>0.325</td><td>0.318</td><td>0.495</td><td>75.8</td><td>63.6</td><td>0.600</td><td>0.650</td><td>0.083</td><td>0.284</td></tr>
    <tr><td>OLMOCR-sglang</td><td>0.326</td><td>0.469</td><td>0.097</td><td>0.293</td><td>0.455</td><td>0.655</td><td>68.1</td><td>61.3</td><td>0.608</td><td>0.652</td><td>0.145</td><td>0.277</td></tr>
    <tr><td>SmolDocling-256M</td><td>0.493</td><td>0.816</td><td>0.262</td><td>0.838</td><td>0.753</td><td>0.997</td><td>44.9</td><td>16.5</td><td>0.729</td><td>0.907</td><td>0.227</td><td>0.522</td></tr>
    <tr><td>Dolphin</td><td>0.206</td><td>0.306</td><td>0.107</td><td>0.197</td><td>0.447</td><td>0.580</td><td>77.3</td><td>67.2</td><td>0.180</td><td>0.285</td><td>0.091</td><td>0.162</td></tr>
    <tr><td>MinerU 2</td><td>0.139</td><td>0.240</td><td>0.047</td><td>0.109</td><td>0.297</td><td>0.536</td><td>82.5</td><td>79.0</td><td>0.141</td><td>0.195</td><td>0.069</td><td>0.118</td></tr>
    <tr><td>OCRFlux</td><td>0.195</td><td>0.281</td><td>0.064</td><td>0.183</td><td>0.379</td><td>0.613</td><td>71.6</td><td>81.3</td><td>0.253</td><td>0.139</td><td>0.086</td><td>0.187</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td>0.138</td><td>0.206</td><td>0.067</td><td>0.107</td><td><strong>0.246</strong></td><td>0.421</td><td>81.5</td><td>87.5</td><td>0.139</td><td>0.111</td><td>0.100</td><td>0.185</td></tr>
    <tr><td rowspan="5"><strong>General<br>VLMs</strong></td><td>GPT4o</td><td>0.233</td><td>0.399</td><td>0.144</td><td>0.409</td><td>0.425</td><td>0.606</td><td>72.0</td><td>62.9</td><td>0.234</td><td>0.329</td><td>0.128</td><td>0.251</td></tr>
    <tr><td>Qwen2-VL-72B</td><td>0.252</td><td>0.327</td><td>0.096</td><td>0.218</td><td>0.404</td><td>0.487</td><td>76.8</td><td>76.4</td><td>0.387</td><td>0.408</td><td>0.119</td><td>0.193</td></tr>
    <tr><td>Qwen2.5-VL-72B</td><td>0.214</td><td>0.261</td><td>0.092</td><td>0.18</td><td>0.315</td><td>0.434</td><td>82.9</td><td>83.9</td><td>0.341</td><td>0.262</td><td>0.106</td><td>0.168</td></tr>
    <tr><td>Gemini2.5-Pro</td><td>0.148</td><td>0.212</td><td>0.055</td><td>0.168</td><td>0.356</td><td>0.439</td><td>85.8</td><td>86.4</td><td>0.13</td><td>0.119</td><td>0.049</td><td>0.121</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.140</td><td>0.162</td><td>0.043</td><td>0.085</td><td>0.295</td><td><strong>0.384</strong></td><td>83.3</td><td><strong>89.3</strong></td><td>0.165</td><td><strong>0.085</strong></td><td>0.058</td><td>0.094</td></tr>
    <tr><td rowspan="1"><strong>Expert VLMs</strong></td><td><strong>dots.ocr</strong></td><td><strong>0.125</strong></td><td><strong>0.160</strong></td><td><strong>0.032</strong></td><td><strong>0.066</strong></td><td>0.329</td><td>0.416</td><td><strong>88.6</strong></td><td>89.0</td><td><strong>0.099</strong></td><td>0.092</td><td><strong>0.040</strong></td><td><strong>0.067</strong></td></tr>
  </tbody>
</table>


#### The end-to-end text recognition performance across 9 PDF page types.

<table>
  <thead>
    <tr>
      <th><strong>Model<br>Type</strong></th>
      <th><strong>Models</strong></th>
      <th><strong>Book</strong></th>
      <th><strong>Slides</strong></th>
      <th><strong>Financial<br>Report</strong></th>
      <th><strong>Textbook</strong></th>
      <th><strong>Exam<br>Paper</strong></th>
      <th><strong>Magazine</strong></th>
      <th><strong>Academic<br>Papers</strong></th>
      <th><strong>Notes</strong></th>
      <th><strong>Newspaper</strong></th>
      <th><strong>Overall</strong></th>
    </tr>
  </thead>
  <tbody>
    <tr><td rowspan="3"><strong>Pipeline<br>Tools</strong></td><td>MinerU</td><td>0.055</td><td>0.124</td><td><u>0.033</u></td><td>0.102</td><td>0.159</td><td><strong>0.072</strong></td><td><u>0.025</u></td><td>0.984</td><td>0.171</td><td>0.206</td></tr>
    <tr><td>Marker</td><td>0.074</td><td>0.340</td><td>0.089</td><td>0.319</td><td>0.452</td><td>0.153</td><td>0.059</td><td>0.651</td><td>0.192</td><td>0.274</td></tr>
    <tr><td>Mathpix</td><td>0.131</td><td>0.220</td><td>0.202</td><td>0.216</td><td>0.278</td><td>0.147</td><td>0.091</td><td>0.634</td><td>0.690</td><td>0.300</td></tr>
    <tr><td rowspan="5"><strong>Expert<br>VLMs</strong></td><td>GOT-OCR</td><td>0.111</td><td>0.222</td><td>0.067</td><td>0.132</td><td>0.204</td><td>0.198</td><td>0.179</td><td>0.388</td><td>0.771</td><td>0.267</td></tr>
    <tr><td>Nougat</td><td>0.734</td><td>0.958</td><td>1.000</td><td>0.820</td><td>0.930</td><td>0.830</td><td>0.214</td><td>0.991</td><td>0.871</td><td>0.806</td></tr>
    <tr><td>Dolphin</td><td>0.091</td><td>0.131</td><td>0.057</td><td>0.146</td><td>0.231</td><td>0.121</td><td>0.074</td><td>0.363</td><td>0.307</td><td>0.177</td></tr>
    <tr><td>OCRFlux</td><td>0.068</td><td>0.125</td><td>0.092</td><td>0.102</td><td>0.119</td><td>0.083</td><td>0.047</td><td>0.223</td><td>0.536</td><td>0.149</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td>0.084</td><td>0.129</td><td>0.060</td><td>0.090</td><td>0.107</td><td>0.073</td><td>0.050</td><td>0.171</td><td>0.107</td><td>0.100</td></tr>
    <tr><td rowspan="4"><strong>General<br>VLMs</strong></td><td>GPT4o</td><td>0.157</td><td>0.163</td><td>0.348</td><td>0.187</td><td>0.281</td><td>0.173</td><td>0.146</td><td>0.607</td><td>0.751</td><td>0.316</td></tr>
    <tr><td>Qwen2.5-VL-7B</td><td>0.148</td><td>0.053</td><td>0.111</td><td>0.137</td><td>0.189</td><td>0.117</td><td>0.134</td><td>0.204</td><td>0.706</td><td>0.205</td></tr>
    <tr><td>InternVL3-8B</td><td>0.163</td><td>0.056</td><td>0.107</td><td>0.109</td><td>0.129</td><td>0.100</td><td>0.159</td><td>0.150</td><td>0.681</td><td>0.188</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.048</td><td>0.048</td><td>0.024</td><td><strong>0.062</strong></td><td>0.085</td><td>0.051</td><td>0.039</td><td><strong>0.096</strong></td><td>0.181</td><td>0.073</td></tr>
    <tr><td rowspan="1"><strong>Expert VLMs</strong></td><td><strong>dots.ocr</strong></td><td><strong>0.031</strong></td><td><strong>0.047</strong></td><td><strong>0.011</strong></td><td>0.082</td><td><strong>0.079</strong></td><td><strong>0.028</strong></td><td><strong>0.029</strong></td><td>0.109</td><td><strong>0.056</strong></td><td><strong>0.055</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
> - We remove the Page-header and Page-footer cells from the result markdown.
> - We use the tikz_preprocess pipeline to upsample the images to 200 DPI.


### 2. **dots.ocr-bench**

This is an in-house benchmark containing 1,493 PDF images across 100 languages.

#### The end-to-end evaluation results of different tasks.

<table>
  <thead>
    <tr>
      <th><strong>Methods</strong></th>
      <th><strong>Overall<sup>Edit</sup>↓</strong></th>
      <th><strong>Text<sup>Edit</sup>↓</strong></th>
      <th><strong>Formula<sup>Edit</sup>↓</strong></th>
      <th><strong>Table<sup>TEDS</sup>↑</strong></th>
      <th><strong>Table<sup>Edit</sup>↓</strong></th>
      <th><strong>Read Order<sup>Edit</sup>↓</strong></th>
    </tr>
  </thead>
  <tbody>
    <tr><td>MonkeyOCR-3B</td><td>0.483</td><td>0.445</td><td>0.627</td><td>50.93</td><td>0.452</td><td>0.409</td></tr>
    <tr><td>doubao-1-5-thinking-vision-pro-250428</td><td>0.291</td><td>0.226</td><td>0.440</td><td>71.2</td><td>0.260</td><td>0.238</td></tr>
    <tr><td>doubao-1-6</td><td>0.299</td><td>0.270</td><td>0.417</td><td>71.0</td><td>0.258</td><td>0.253</td></tr>
    <tr><td>Gemini2.5-Pro</td><td>0.251</td><td>0.163</td><td>0.402</td><td>77.1</td><td>0.236</td><td>0.202</td></tr>
    <tr><td><strong>dots.ocr</strong></td><td><strong>0.177</strong></td><td><strong>0.075</strong></td><td><strong>0.297</strong></td><td><strong>79.2</strong></td><td><strong>0.186</strong></td><td><strong>0.152</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - We use the same metric calculation pipeline as [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
> - We remove the Page-header and Page-footer cells from the result markdown.

#### Layout Detection

<table>
  <thead>
    <tr>
      <th rowspan="2"><strong>Method</strong></th>
      <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50:.05:.95↑</strong></th>
      <th colspan="5" style="text-align: center;"><strong>F1@IoU=.50↑</strong></th>
    </tr>
    <tr>
      <th>Overall</th><th>Text</th><th>Formula</th><th>Table</th><th>Picture</th>
      <th>Overall</th><th>Text</th><th>Formula</th><th>Table</th><th>Picture</th>
    </tr>
  </thead>
  <tbody>
    <tr><td>DocLayout-YOLO-DocStructBench</td><td>0.733</td><td>0.694</td><td>0.480</td><td>0.803</td><td>0.619</td><td>0.806</td><td>0.779</td><td>0.620</td><td>0.858</td><td>0.678</td></tr>
    <tr><td>dots.ocr-parse all</td><td>0.831</td><td>0.801</td><td>0.654</td><td>0.838</td><td>0.748</td><td>0.922</td><td>0.909</td><td>0.770</td><td>0.888</td><td>0.831</td></tr>
    <tr><td><strong>dots.ocr-detection only</strong></td><td><strong>0.845</strong></td><td><strong>0.816</strong></td><td><strong>0.716</strong></td><td><strong>0.875</strong></td><td><strong>0.765</strong></td><td><strong>0.930</strong></td><td><strong>0.917</strong></td><td><strong>0.832</strong></td><td><strong>0.918</strong></td><td><strong>0.843</strong></td></tr>
  </tbody>
</table>

> **Notes:**
> - We use prompt_layout_all_en for **parse all** and prompt_layout_only_en for **detection only**; please refer to the [prompts](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py). A usage sketch follows below.

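The snippet below is a minimal sketch of switching between the two modes, assuming a local vLLM deployment of the model and using the `DotsOCRParser.parse_image` API that the Gradio demo in this repository also calls; the server address and image path are placeholders:

```python
from PIL import Image
from dots_ocr.parser import DotsOCRParser

# Placeholder server address/port; point these at your own vLLM deployment.
parser = DotsOCRParser(ip="127.0.0.1", port=8000, dpi=200)
img = Image.open("demo/demo_image1.jpg")

# "parse all": layout detection plus content recognition in a single pass.
parser.parse_image(input_path=img, filename="demo", prompt_mode="prompt_layout_all_en", save_dir="./output")

# "detection only": layout boxes and categories, without text content.
parser.parse_image(input_path=img, filename="demo", prompt_mode="prompt_layout_only_en", save_dir="./output")
```
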
### 3. olmOCR-bench

<table>
  <thead>
    <tr>
      <th>Model</th>
      <th>ArXiv</th>
      <th>Old Scans<br>Math</th>
      <th>Tables</th>
      <th>Old Scans</th>
      <th>Headers and<br>Footers</th>
      <th>Multi<br>column</th>
      <th>Long Tiny<br>Text</th>
      <th>Base</th>
      <th>Overall</th>
    </tr>
  </thead>
  <tbody>
    <tr><td>GOT OCR</td><td>52.7</td><td>52.0</td><td>0.2</td><td>22.1</td><td>93.6</td><td>42.0</td><td>29.9</td><td>94.0</td><td>48.3 ± 1.1</td></tr>
    <tr><td>Marker</td><td>76.0</td><td>57.9</td><td>57.6</td><td>27.8</td><td>84.9</td><td>72.9</td><td>84.6</td><td>99.1</td><td>70.1 ± 1.1</td></tr>
    <tr><td>MinerU</td><td>75.4</td><td>47.4</td><td>60.9</td><td>17.3</td><td><strong>96.6</strong></td><td>59.0</td><td>39.1</td><td>96.6</td><td>61.5 ± 1.1</td></tr>
    <tr><td>Mistral OCR</td><td>77.2</td><td>67.5</td><td>60.6</td><td>29.3</td><td>93.6</td><td>71.3</td><td>77.1</td><td>99.4</td><td>72.0 ± 1.1</td></tr>
    <tr><td>Nanonets OCR</td><td>67.0</td><td>68.6</td><td>77.7</td><td>39.5</td><td>40.7</td><td>69.9</td><td>53.4</td><td>99.3</td><td>64.5 ± 1.1</td></tr>
    <tr><td>GPT-4o<br>(No Anchor)</td><td>51.5</td><td><strong>75.5</strong></td><td>69.1</td><td>40.9</td><td>94.2</td><td>68.9</td><td>54.1</td><td>96.7</td><td>68.9 ± 1.1</td></tr>
    <tr><td>GPT-4o<br>(Anchored)</td><td>53.5</td><td>74.5</td><td>70.0</td><td>40.7</td><td>93.8</td><td>69.3</td><td>60.6</td><td>96.8</td><td>69.9 ± 1.1</td></tr>
    <tr><td>Gemini Flash 2<br>(No Anchor)</td><td>32.1</td><td>56.3</td><td>61.4</td><td>27.8</td><td>48.0</td><td>58.7</td><td><strong>84.4</strong></td><td>94.0</td><td>57.8 ± 1.1</td></tr>
    <tr><td>Gemini Flash 2<br>(Anchored)</td><td>54.5</td><td>56.1</td><td>72.1</td><td>34.2</td><td>64.7</td><td>61.5</td><td>71.5</td><td>95.6</td><td>63.8 ± 1.2</td></tr>
    <tr><td>Qwen 2 VL<br>(No Anchor)</td><td>19.7</td><td>31.7</td><td>24.2</td><td>17.1</td><td>88.9</td><td>8.3</td><td>6.8</td><td>55.5</td><td>31.5 ± 0.9</td></tr>
    <tr><td>Qwen 2.5 VL<br>(No Anchor)</td><td>63.1</td><td>65.7</td><td>67.3</td><td>38.6</td><td>73.6</td><td>68.3</td><td>49.1</td><td>98.3</td><td>65.5 ± 1.2</td></tr>
    <tr><td>olmOCR v0.1.75<br>(No Anchor)</td><td>71.5</td><td>71.4</td><td>71.4</td><td><strong>42.8</strong></td><td>94.1</td><td>77.7</td><td>71.0</td><td>97.8</td><td>74.7 ± 1.1</td></tr>
    <tr><td>olmOCR v0.1.75<br>(Anchored)</td><td>74.9</td><td>71.2</td><td>71.0</td><td>42.2</td><td>94.5</td><td>78.3</td><td>73.3</td><td>98.3</td><td>75.5 ± 1.0</td></tr>
    <tr><td>MonkeyOCR-pro-3B</td><td><strong>83.8</strong></td><td>68.8</td><td>74.6</td><td>36.1</td><td>91.2</td><td>76.6</td><td>80.1</td><td>95.3</td><td>75.8 ± 1.0</td></tr>
    <tr><td><strong>dots.ocr</strong></td><td>82.1</td><td>64.2</td><td><strong>88.3</strong></td><td>40.9</td><td>94.1</td><td><strong>82.4</strong></td><td>81.2</td><td><strong>99.5</strong></td><td><strong>79.1 ± 1.0</strong></td></tr>
  </tbody>
</table>


> **Note:**
> - The metrics are from [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
> - We remove the Page-header and Page-footer cells from the result markdown.

## Methods

### Pretrain

We developed a foundational Vision-Language Model (VLM) through a three-stage training process:

* **Stage 1: Vision Encoder Pre-training**
  We trained a 1.2-billion-parameter Vision Encoder (VE) from scratch on a vast and comprehensive dataset of image-text pairs.
* **Stage 2: VE Continued Pre-training**
  We incorporated additional visual data, including OCR, video, and grounding data. Leveraging the `NaViT` architecture, our model supports high-resolution inputs of up to 11 million pixels (see the sketch after this list). The VE was then aligned with the `Qwen2.5-1.5B` language model and trained on this diverse visual data with the LLM frozen, which resulted in our general vision encoder `dots.vit`.
* **Stage 3: VLM Specialization for OCR**
  We then trained on a pure OCR dataset. To improve training efficiency, we first trained on a certain volume of tokens with the VE parameters frozen. Subsequently, we unfroze all parameters and continued training on an additional one-fifth of that token volume, which produced our foundational OCR model, `dots.ocr.base`.
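
As a hedged illustration of that pixel budget (a hypothetical helper, assuming only the `MIN_PIXELS`/`MAX_PIXELS` constants exported by `dots_ocr.utils.consts`), an input image can be rescaled into the supported range before inference:

```python
from PIL import Image
from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS

def fit_pixel_budget(img: Image.Image) -> Image.Image:
    """Hypothetical helper: rescale an image so its area falls inside [MIN_PIXELS, MAX_PIXELS]."""
    pixels = img.width * img.height
    if pixels > MAX_PIXELS:
        scale = (MAX_PIXELS / pixels) ** 0.5
    elif pixels < MIN_PIXELS:
        scale = (MIN_PIXELS / pixels) ** 0.5
    else:
        return img
    return img.resize((max(1, int(img.width * scale)), max(1, int(img.height * scale))))
```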

### SFT

The SFT stage was built on the following key strategies:

* **Diverse SFT Dataset:** We constructed a dataset of nearly 300,000 samples, integrating our in-house manual annotations, synthetic data (tables, formulas, multilingual OCR), as well as open-source datasets.
* **Iterative Data Flywheel:** We employed a feedback loop to build an in-house multilingual structured layout dataset of 15k samples. This process, repeated over three iterations, involved:
    * Sampling "bad cases" based on model performance.
    * Manually annotating these cases.
    * Adding them back into the training set.
* **Reading Order:** We corrected the sequence of all layout element boxes to establish the correct reading order. This was primarily done using larger models for sorting, supplemented by rule-based post-processing methods. We found that with sufficient data diversity and quality, training the model on a list of elements sorted in their natural reading order yields excellent results.
* **Quality and Robustness:** We built a multi-expert system for data cleaning and distillation, and applied data augmentation (resizing, rotation, noise) to improve model robustness.
* **Multitask Training:** We leveraged a single source of structured layout data to generate the SFT data with a variety of prompts. This enables the model to perform different tasks, such as detection and recognition, based on the specific prompt provided (see the sketch below).
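
A minimal sketch of inspecting the prompt modes that drive this multitask behaviour, assuming the repository's `dots_ocr` package is installed (the mapping is the same one the demos import):

```python
from dots_ocr.utils import dict_promptmode_to_prompt

# Each entry maps a task name (e.g. full layout parsing, detection-only, plain OCR)
# to the prompt text that selects that task at inference time.
for mode, prompt in dict_promptmode_to_prompt.items():
    print(f"{mode}: {prompt[:60]}...")
```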

The resulting `dots.ocr` model demonstrates performance on par with models possessing significantly more parameters.


## Limitations & Future Work

- **Complex Document Elements:**
  - **Table & Formula**: dots.ocr is not yet perfect at extracting highly complex tables and formulas.
  - **Picture**: Pictures in documents are currently not parsed.

- **Parsing Failures:** The model may fail to parse under certain conditions:
  - When the character-to-pixel ratio is excessively high. Try enlarging the image or increasing the PDF parsing DPI (a setting of 200 is recommended; see the sketch after this list). However, please note that the model performs optimally on images with a resolution under 11,289,600 pixels.
  - Continuous special characters, such as ellipses (`...`) and underscores (`_`), may cause the prediction output to repeat endlessly. In such scenarios, consider using alternative prompts like `prompt_layout_only_en`, `prompt_ocr`, or `prompt_grounding_ocr` ([details here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)).

- **Performance Bottleneck:** Despite its 1.7B-parameter LLM foundation, **dots.ocr** is not yet optimized for high-throughput processing of large PDF volumes.
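
As a concrete illustration of the DPI advice above (a sketch using PyMuPDF, which this repository already relies on for its fitz-based preprocessing; the file path is a placeholder):

```python
import fitz  # PyMuPDF

# Placeholder document; substitute your own PDF.
doc = fitz.open("demo/demo_pdf1.pdf")
page = doc[0]

# Rasterize at the recommended 200 DPI before sending the page to the model.
pix = page.get_pixmap(dpi=200)
pix.save("page0_200dpi.png")
```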

We are committed to achieving more accurate table and formula parsing, as well as enhancing the model's OCR capabilities for broader generalization, all while aiming for **a more powerful, more efficient model**. Furthermore, we are actively considering the development of **a more general-purpose perception model** based on Vision-Language Models (VLMs), which would integrate general detection, image captioning, and OCR tasks into a unified framework. **Parsing the content of the pictures in documents** is also a key priority for our future work.
We believe that collaboration is the key to tackling these exciting challenges. If you are passionate about advancing the frontiers of document intelligence and are interested in contributing to these future endeavors, we would love to hear from you. Please reach out to us via email at: [yanqing4@xiaohongshu.com].

## Author List

### Contributors
Mi Jian, Yumeng Li, Bowen Wang, Xiaomin He, Zheyuan Gu

### Project Leader
Qing Yan

### Advisor
Colin Zhang, Lei Zhang
assets/chart.png ADDED

Git LFS Details

  • SHA256: 0576d51813061c25f36c0fcbca837fed1a1d8e06042f2b352be4bdc7b7b5cab1
  • Pointer size: 130 Bytes
  • Size of remote file: 64.5 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: ad0b70b18bbf2fb7ad1a838437c1c6069eeb3fdf2df42f7299ec9abeb3427ae4
  • Pointer size: 130 Bytes
  • Size of remote file: 67.2 kB
assets/showcase/Tibetan.png ADDED

Git LFS Details

  • SHA256: 97bdb98172dc2d5c6a4668188588eb15cc33ecd042f9d9b8224ea933229741ce
  • Pointer size: 132 Bytes
  • Size of remote file: 2.89 MB
assets/showcase/formula1.png ADDED

Git LFS Details

  • SHA256: 5f7196032f7c4cc6aad9112ba4edeca6e1c3b303c34828711e107f0bb6603c44
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
assets/showcase/formula2.png ADDED

Git LFS Details

  • SHA256: a6edff564ee572a17062a2356eb6d83b98fc15e8bf1544b554f62003ce3ec98b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.74 MB
assets/showcase/formula3.png ADDED

Git LFS Details

  • SHA256: 45b6331b43e3b11d0af4674f021f04c9b9e4e096cf533c8f5f8a15d46261982f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
assets/showcase/grounding.png ADDED

Git LFS Details

  • SHA256: a11a2b2feba8208820ec35c8036c1ee5c0588ce9c9010a4e9ce7901c7cb65e8a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/showcase/kannada.png ADDED

Git LFS Details

  • SHA256: 96f0d36e3e0b08029903066a931defe9ddf002e515d7c63262dcbeeb6b86b32a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.92 MB
assets/showcase/nl.png ADDED

Git LFS Details

  • SHA256: 53e3bd10e4a85b9dfdbb3fc3b192c47f9834101dc224d4d979c145a0a574c700
  • Pointer size: 132 Bytes
  • Size of remote file: 3.84 MB
assets/showcase/reading_order.png ADDED

Git LFS Details

  • SHA256: 916b8cd5833ec7bbbd896771537ab66aa96a9c7f70e52685d7df533b6b0cbd2a
  • Pointer size: 132 Bytes
  • Size of remote file: 2.9 MB
assets/showcase/russian.png ADDED

Git LFS Details

  • SHA256: 307f66b083df466e5a84b049e6d5cf8117050d6e1a612dc2b2fe7f2c0e996b9c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.06 MB
assets/showcase/table1.png ADDED

Git LFS Details

  • SHA256: b0f75ef4c9a995a8cd29585dc7e9714fa9cb0e98490ededc745094e6c9dfd375
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
assets/showcase/table2.png ADDED

Git LFS Details

  • SHA256: d6084dac8845096749ba98191552182b98bde72806577b693d02069a1cc91b5b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.77 MB
assets/showcase/table3.png ADDED

Git LFS Details

  • SHA256: c42c3b33230d4d00f83b41cb22a9f21511de138bbbc4ce04c62aa916eed53428
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
assets/showcase/tradition_zh.png ADDED

Git LFS Details

  • SHA256: dfe7892659fdb07733ba102eeb55f2532a604194596eabb81b28d847a8127e50
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
assets/showcase_origin/Tibetan.png ADDED

Git LFS Details

  • SHA256: a761e2eeb987ea3c08ade69c9ffe5781d7e9a06828a1abd474c63b7f27e6d278
  • Pointer size: 131 Bytes
  • Size of remote file: 966 kB
assets/showcase_origin/formula_1.jpg ADDED

Git LFS Details

  • SHA256: 5b01fa0b9f47e2b0de6b67e02dc869600c8d715b98a952e05868a86d958348ce
  • Pointer size: 131 Bytes
  • Size of remote file: 677 kB
assets/showcase_origin/formula_2.jpg ADDED

Git LFS Details

  • SHA256: 322ec389bcd88e6870ffb91ccf5ca6b667b02b5c129f44e3c6e93877e7f95800
  • Pointer size: 131 Bytes
  • Size of remote file: 300 kB
assets/showcase_origin/formula_3.jpg ADDED

Git LFS Details

  • SHA256: e47451f351abdd184f8bda270e8fba08cb1e739157584d064d9245e4fbf29247
  • Pointer size: 131 Bytes
  • Size of remote file: 269 kB
assets/showcase_origin/kannada.jpg ADDED

Git LFS Details

  • SHA256: dad7aefe09cb39d7db21cd9e1c86c6fd47a2775e55b2fbe087ebdc2f44f0ab9f
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
assets/showcase_origin/nl.png ADDED

Git LFS Details

  • SHA256: aabb798d409851fb0fee59f3152354827fc633c5f9103a6ae130e6849e4c6030
  • Pointer size: 132 Bytes
  • Size of remote file: 1.15 MB
assets/showcase_origin/reading_order.png ADDED

Git LFS Details

  • SHA256: ebf62f427254a527d917b2d7acb3e68f7a6881277ffa382192e584508a84ca91
  • Pointer size: 131 Bytes
  • Size of remote file: 689 kB
assets/showcase_origin/russian.png ADDED

Git LFS Details

  • SHA256: 46e1e851f18e67153291b0608563eb98095975e1a9b0e23aa7a2308e229fdf49
  • Pointer size: 132 Bytes
  • Size of remote file: 1.8 MB
assets/showcase_origin/table_1.jpg ADDED

Git LFS Details

  • SHA256: 90345584ccc2c4a883779e5d47693276e8cf3fe752700af4f03b3142ab46cfa2
  • Pointer size: 131 Bytes
  • Size of remote file: 773 kB
assets/showcase_origin/table_2.jpg ADDED

Git LFS Details

  • SHA256: 308a117b9293b92ca11f2ead9d2bca58df39c435e53d50e7a78785785041acf1
  • Pointer size: 131 Bytes
  • Size of remote file: 942 kB
assets/showcase_origin/table_3.jpg ADDED

Git LFS Details

  • SHA256: 4542239b141f27f85006b1ec533e671e6e338ed4e18430b5974aa7a2d1105fef
  • Pointer size: 132 Bytes
  • Size of remote file: 2.06 MB
assets/showcase_origin/tradition_zh.png ADDED

Git LFS Details

  • SHA256: 318d5e7b11b0569deb0021a057cd8068d5e1b16ce50dfa2e8628998b1b5a448d
  • Pointer size: 131 Bytes
  • Size of remote file: 960 kB
assets/wechat.png ADDED

Git LFS Details

  • SHA256: c2208f35514007740f9b1efc1f738f0735095f5d6cd79b47eb7fac63bc7a0941
  • Pointer size: 131 Bytes
  • Size of remote file: 593 kB
demo/demo_colab_remote_server.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
demo/demo_gradio.py ADDED
@@ -0,0 +1,726 @@
1
+ """
2
+ Layout Inference Web Application with Gradio
3
+
4
+ A Gradio-based layout inference tool that supports image uploads and multiple backend inference engines.
5
+ It adopts a reference-style interface design while preserving the original inference logic.
6
+ """
7
+
8
+ import gradio as gr
9
+ import json
10
+ import os
11
+ import io
12
+ import tempfile
13
+ import base64
14
+ import zipfile
15
+ import uuid
16
+ import re
17
+ from pathlib import Path
18
+ from PIL import Image
19
+ import requests
20
+ import shutil # Import shutil for cleanup
21
+
22
+ # Local tool imports
23
+ from dots_ocr.utils import dict_promptmode_to_prompt
24
+ from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
25
+ from dots_ocr.utils.demo_utils.display import read_image
26
+ from dots_ocr.utils.doc_utils import load_images_from_pdf
27
+
28
+ # Add DotsOCRParser import
29
+ from dots_ocr.parser import DotsOCRParser
30
+
31
+
32
+ # ==================== Configuration ====================
33
+ DEFAULT_CONFIG = {
34
+ 'ip': "127.0.0.1",
35
+ 'port_vllm': 8000,
36
+ 'min_pixels': MIN_PIXELS,
37
+ 'max_pixels': MAX_PIXELS,
38
+ 'test_images_dir': "./assets/showcase_origin",
39
+ }
40
+
41
+ # ==================== Global Variables ====================
42
+ # Store current configuration
43
+ current_config = DEFAULT_CONFIG.copy()
44
+
45
+ # Create DotsOCRParser instance
46
+ dots_parser = DotsOCRParser(
47
+ ip=DEFAULT_CONFIG['ip'],
48
+ port=DEFAULT_CONFIG['port_vllm'],
49
+ dpi=200,
50
+ min_pixels=DEFAULT_CONFIG['min_pixels'],
51
+ max_pixels=DEFAULT_CONFIG['max_pixels']
52
+ )
53
+
54
+ def get_initial_session_state():
55
+ return {
56
+ 'processing_results': {
57
+ 'original_image': None,
58
+ 'processed_image': None,
59
+ 'layout_result': None,
60
+ 'markdown_content': None,
61
+ 'cells_data': None,
62
+ 'temp_dir': None,
63
+ 'session_id': None,
64
+ 'result_paths': None,
65
+ 'pdf_results': None
66
+ },
67
+ 'pdf_cache': {
68
+ "images": [],
69
+ "current_page": 0,
70
+ "total_pages": 0,
71
+ "file_type": None,
72
+ "is_parsed": False,
73
+ "results": []
74
+ }
75
+ }
76
+
77
+ def read_image_v2(img):
78
+ """Reads an image, supports URLs and local paths"""
79
+ if isinstance(img, str) and img.startswith(("http://", "https://")):
80
+ with requests.get(img, stream=True) as response:
81
+ response.raise_for_status()
82
+ img = Image.open(io.BytesIO(response.content))
83
+ elif isinstance(img, str):
84
+ img, _, _ = read_image(img, use_native=True)
85
+ elif isinstance(img, Image.Image):
86
+ pass
87
+ else:
88
+ raise ValueError(f"Invalid image type: {type(img)}")
89
+ return img
90
+
91
+ def load_file_for_preview(file_path, session_state):
92
+ """Loads a file for preview, supports PDF and image files"""
93
+ pdf_cache = session_state['pdf_cache']
94
+
95
+ if not file_path or not os.path.exists(file_path):
96
+ return None, "<div id='page_info_box'>0 / 0</div>", session_state
97
+
98
+ file_ext = os.path.splitext(file_path)[1].lower()
99
+
100
+ try:
101
+ if file_ext == '.pdf':
102
+ pages = load_images_from_pdf(file_path)
103
+ pdf_cache["file_type"] = "pdf"
104
+ elif file_ext in ['.jpg', '.jpeg', '.png']:
105
+ image = Image.open(file_path)
106
+ pages = [image]
107
+ pdf_cache["file_type"] = "image"
108
+ else:
109
+ return None, "<div id='page_info_box'>Unsupported file format</div>", session_state
110
+ except Exception as e:
111
+ return None, f"<div id='page_info_box'>PDF loading failed: {str(e)}</div>", session_state
112
+
113
+ pdf_cache["images"] = pages
114
+ pdf_cache["current_page"] = 0
115
+ pdf_cache["total_pages"] = len(pages)
116
+ pdf_cache["is_parsed"] = False
117
+ pdf_cache["results"] = []
118
+
119
+ return pages[0], f"<div id='page_info_box'>1 / {len(pages)}</div>", session_state
120
+
121
+ def turn_page(direction, session_state):
122
+ """Page turning function"""
123
+ pdf_cache = session_state['pdf_cache']
124
+
125
+ if not pdf_cache["images"]:
126
+ return None, "<div id='page_info_box'>0 / 0</div>", "", session_state
127
+
128
+ if direction == "prev":
129
+ pdf_cache["current_page"] = max(0, pdf_cache["current_page"] - 1)
130
+ elif direction == "next":
131
+ pdf_cache["current_page"] = min(pdf_cache["total_pages"] - 1, pdf_cache["current_page"] + 1)
132
+
133
+ index = pdf_cache["current_page"]
134
+ current_image = pdf_cache["images"][index] # Use the original image by default
135
+ page_info = f"<div id='page_info_box'>{index + 1} / {pdf_cache['total_pages']}</div>"
136
+
137
+ current_json = ""
138
+ if pdf_cache["is_parsed"] and index < len(pdf_cache["results"]):
139
+ result = pdf_cache["results"][index]
140
+ if 'cells_data' in result and result['cells_data']:
141
+ try:
142
+ current_json = json.dumps(result['cells_data'], ensure_ascii=False, indent=2)
143
+ except:
144
+ current_json = str(result.get('cells_data', ''))
145
+ if 'layout_image' in result and result['layout_image']:
146
+ current_image = result['layout_image']
147
+
148
+ return current_image, page_info, current_json, session_state
149
+
150
+ def get_test_images():
151
+ """Gets the list of test images"""
152
+ test_images = []
153
+ test_dir = current_config['test_images_dir']
154
+ if os.path.exists(test_dir):
155
+ test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)
156
+ if name.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf'))]
157
+ return test_images
158
+
159
+ def create_temp_session_dir():
160
+ """Creates a unique temporary directory for each processing request"""
161
+ session_id = uuid.uuid4().hex[:8]
162
+ temp_dir = os.path.join(tempfile.gettempdir(), f"dots_ocr_demo_{session_id}")
163
+ os.makedirs(temp_dir, exist_ok=True)
164
+ return temp_dir, session_id
165
+
166
+ def parse_image_with_high_level_api(parser, image, prompt_mode, fitz_preprocess=False):
167
+ """
168
+ Processes using the high-level API parse_image from DotsOCRParser
169
+ """
170
+ # Create a temporary session directory
171
+ temp_dir, session_id = create_temp_session_dir()
172
+
173
+ try:
174
+ # Save the PIL Image as a temporary file
175
+ temp_image_path = os.path.join(temp_dir, f"input_{session_id}.png")
176
+ image.save(temp_image_path, "PNG")
177
+
178
+ # Use the high-level API parse_image
179
+ filename = f"demo_{session_id}"
180
+ results = parser.parse_image(
181
+ input_path=image,
182
+ filename=filename,
183
+ prompt_mode=prompt_mode,
184
+ save_dir=temp_dir,
185
+ fitz_preprocess=fitz_preprocess
186
+ )
187
+
188
+ # Parse the results
189
+ if not results:
190
+ raise ValueError("No results returned from parser")
191
+
192
+ result = results[0] # parse_image returns a list with a single result
193
+
194
+ layout_image = None
195
+ if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
196
+ layout_image = Image.open(result['layout_image_path'])
197
+
198
+ cells_data = None
199
+ if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
200
+ with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
201
+ cells_data = json.load(f)
202
+
203
+ md_content = None
204
+ if 'md_content_path' in result and os.path.exists(result['md_content_path']):
205
+ with open(result['md_content_path'], 'r', encoding='utf-8') as f:
206
+ md_content = f.read()
207
+
208
+ return {
209
+ 'layout_image': layout_image,
210
+ 'cells_data': cells_data,
211
+ 'md_content': md_content,
212
+ 'filtered': result.get('filtered', False),
213
+ 'temp_dir': temp_dir,
214
+ 'session_id': session_id,
215
+ 'result_paths': result,
216
+ 'input_width': result.get('input_width', 0),
217
+ 'input_height': result.get('input_height', 0),
218
+ }
219
+ except Exception as e:
220
+ if os.path.exists(temp_dir):
221
+ shutil.rmtree(temp_dir, ignore_errors=True)
222
+ raise e
223
+
224
+ def parse_pdf_with_high_level_api(parser, pdf_path, prompt_mode):
225
+ """
226
+ Processes using the high-level API parse_pdf from DotsOCRParser
227
+ """
228
+ # Create a temporary session directory
229
+ temp_dir, session_id = create_temp_session_dir()
230
+
231
+ try:
232
+ # Use the high-level API parse_pdf
233
+ filename = f"demo_{session_id}"
234
+ results = parser.parse_pdf(
235
+ input_path=pdf_path,
236
+ filename=filename,
237
+ prompt_mode=prompt_mode,
238
+ save_dir=temp_dir
239
+ )
240
+
241
+ # Parse the results
242
+ if not results:
243
+ raise ValueError("No results returned from parser")
244
+
245
+ # Handle multi-page results
246
+ parsed_results = []
247
+ all_md_content = []
248
+ all_cells_data = []
249
+
250
+ for i, result in enumerate(results):
251
+ page_result = {
252
+ 'page_no': result.get('page_no', i),
253
+ 'layout_image': None,
254
+ 'cells_data': None,
255
+ 'md_content': None,
256
+ 'filtered': False
257
+ }
258
+
259
+ # Read the layout image
260
+ if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
261
+ page_result['layout_image'] = Image.open(result['layout_image_path'])
262
+
263
+ # Read the JSON data
264
+ if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
265
+ with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
266
+ page_result['cells_data'] = json.load(f)
267
+ all_cells_data.extend(page_result['cells_data'])
268
+
269
+ # Read the Markdown content
270
+ if 'md_content_path' in result and os.path.exists(result['md_content_path']):
271
+ with open(result['md_content_path'], 'r', encoding='utf-8') as f:
272
+ page_content = f.read()
273
+ page_result['md_content'] = page_content
274
+ all_md_content.append(page_content)
275
+ page_result['filtered'] = result.get('filtered', False)
276
+ parsed_results.append(page_result)
277
+
278
+ combined_md = "\n\n---\n\n".join(all_md_content) if all_md_content else ""
279
+ return {
280
+ 'parsed_results': parsed_results,
281
+ 'combined_md_content': combined_md,
282
+ 'combined_cells_data': all_cells_data,
283
+ 'temp_dir': temp_dir,
284
+ 'session_id': session_id,
285
+ 'total_pages': len(results)
286
+ }
287
+
288
+ except Exception as e:
289
+ if os.path.exists(temp_dir):
290
+ shutil.rmtree(temp_dir, ignore_errors=True)
291
+ raise e
292
+
293
+ # ==================== Core Processing Function ====================
294
+ def process_image_inference(session_state, test_image_input, file_input,
295
+ prompt_mode, server_ip, server_port, min_pixels, max_pixels,
296
+ fitz_preprocess=False
297
+ ):
298
+ """Core function to handle image/PDF inference"""
299
+ # Use session_state instead of global variables
300
+ processing_results = session_state['processing_results']
301
+ pdf_cache = session_state['pdf_cache']
302
+
303
+ if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
304
+ try:
305
+ shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
306
+ except Exception as e:
307
+ print(f"Failed to clean up previous temporary directory: {e}")
308
+
309
+ # Reset processing results for the current session
310
+ session_state['processing_results'] = get_initial_session_state()['processing_results']
311
+ processing_results = session_state['processing_results']
312
+
313
+ current_config.update({
314
+ 'ip': server_ip,
315
+ 'port_vllm': server_port,
316
+ 'min_pixels': min_pixels,
317
+ 'max_pixels': max_pixels
318
+ })
319
+
320
+ # Update parser configuration
321
+ dots_parser.ip = server_ip
322
+ dots_parser.port = server_port
323
+ dots_parser.min_pixels = min_pixels
324
+ dots_parser.max_pixels = max_pixels
325
+
326
+ input_file_path = file_input if file_input else test_image_input
327
+
328
+ if not input_file_path:
329
+ return None, "Please upload image/PDF file or select test image", "", "", gr.update(value=None), None, "", session_state
330
+
331
+ file_ext = os.path.splitext(input_file_path)[1].lower()
332
+
333
+ try:
334
+ if file_ext == '.pdf':
335
+ # MINIMAL CHANGE: The `process_pdf_file` function is now inlined and uses session_state.
336
+ preview_image, page_info, session_state = load_file_for_preview(input_file_path, session_state)
337
+ pdf_result = parse_pdf_with_high_level_api(dots_parser, input_file_path, prompt_mode)
338
+
339
+ session_state['pdf_cache']["is_parsed"] = True
340
+ session_state['pdf_cache']["results"] = pdf_result['parsed_results']
341
+
342
+ processing_results.update({
343
+ 'markdown_content': pdf_result['combined_md_content'],
344
+ 'cells_data': pdf_result['combined_cells_data'],
345
+ 'temp_dir': pdf_result['temp_dir'],
346
+ 'session_id': pdf_result['session_id'],
347
+ 'pdf_results': pdf_result['parsed_results']
348
+ })
349
+
350
+ total_elements = len(pdf_result['combined_cells_data'])
351
+ info_text = f"**PDF Information:**\n- Total Pages: {pdf_result['total_pages']}\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Total Detected Elements: {total_elements}\n- Session ID: {pdf_result['session_id']}"
352
+
353
+ current_page_layout_image = preview_image
354
+ current_page_json = ""
355
+ if session_state['pdf_cache']["results"]:
356
+ first_result = session_state['pdf_cache']["results"][0]
357
+ if 'layout_image' in first_result and first_result['layout_image']:
358
+ current_page_layout_image = first_result['layout_image']
359
+ if first_result.get('cells_data'):
360
+ try:
361
+ current_page_json = json.dumps(first_result['cells_data'], ensure_ascii=False, indent=2)
362
+ except:
363
+ current_page_json = str(first_result['cells_data'])
364
+
365
+ download_zip_path = None
366
+ if pdf_result['temp_dir']:
367
+ download_zip_path = os.path.join(pdf_result['temp_dir'], f"layout_results_{pdf_result['session_id']}.zip")
368
+ with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
369
+ for root, _, files in os.walk(pdf_result['temp_dir']):
370
+ for file in files:
371
+ if not file.endswith('.zip'): zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), pdf_result['temp_dir']))
372
+
373
+ return (
374
+ current_page_layout_image, info_text, pdf_result['combined_md_content'] or "No markdown content generated",
375
+ pdf_result['combined_md_content'] or "No markdown content generated",
376
+ gr.update(value=download_zip_path, visible=bool(download_zip_path)), page_info, current_page_json, session_state
377
+ )
378
+
379
+ else: # Image processing
380
+ image = read_image_v2(input_file_path)
381
+ session_state['pdf_cache'] = get_initial_session_state()['pdf_cache']
382
+
383
+ original_image = image
384
+ parse_result = parse_image_with_high_level_api(dots_parser, image, prompt_mode, fitz_preprocess)
385
+
386
+ if parse_result['filtered']:
387
+ info_text = f"**Image Information:**\n- Original Size: {original_image.width} x {original_image.height}\n- Processing: JSON parsing failed, using cleaned text output\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Session ID: {parse_result['session_id']}"
388
+ processing_results.update({
389
+ 'original_image': original_image, 'markdown_content': parse_result['md_content'],
390
+ 'temp_dir': parse_result['temp_dir'], 'session_id': parse_result['session_id'],
391
+ 'result_paths': parse_result['result_paths']
392
+ })
393
+ return original_image, info_text, parse_result['md_content'], parse_result['md_content'], gr.update(visible=False), None, "", session_state
394
+
395
+ md_content_raw = parse_result['md_content'] or "No markdown content generated"
396
+ processing_results.update({
397
+ 'original_image': original_image, 'layout_result': parse_result['layout_image'],
398
+ 'markdown_content': parse_result['md_content'], 'cells_data': parse_result['cells_data'],
399
+ 'temp_dir': parse_result['temp_dir'], 'session_id': parse_result['session_id'],
400
+ 'result_paths': parse_result['result_paths']
401
+ })
402
+
403
+ num_elements = len(parse_result['cells_data']) if parse_result['cells_data'] else 0
404
+ info_text = f"**Image Information:**\n- Original Size: {original_image.width} x {original_image.height}\n- Model Input Size: {parse_result['input_width']} x {parse_result['input_height']}\n- Server: {current_config['ip']}:{current_config['port_vllm']}\n- Detected {num_elements} layout elements\n- Session ID: {parse_result['session_id']}"
405
+
406
+ current_json = json.dumps(parse_result['cells_data'], ensure_ascii=False, indent=2) if parse_result['cells_data'] else ""
407
+
408
+ download_zip_path = None
409
+ if parse_result['temp_dir']:
410
+ download_zip_path = os.path.join(parse_result['temp_dir'], f"layout_results_{parse_result['session_id']}.zip")
411
+ with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
412
+ for root, _, files in os.walk(parse_result['temp_dir']):
413
+ for file in files:
414
+ if not file.endswith('.zip'): zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), parse_result['temp_dir']))
415
+
416
+ return (
417
+ parse_result['layout_image'], info_text, parse_result['md_content'] or "No markdown content generated",
418
+ md_content_raw, gr.update(value=download_zip_path, visible=bool(download_zip_path)),
419
+ None, current_json, session_state
420
+ )
421
+ except Exception as e:
422
+ import traceback
423
+ traceback.print_exc()
424
+ return None, f"Error during processing: {e}", "", "", gr.update(value=None), None, "", session_state
425
+
426
+ # MINIMAL CHANGE: Functions now take `session_state` as an argument.
427
+ def clear_all_data(session_state):
428
+ """Clears all data"""
429
+ processing_results = session_state['processing_results']
430
+
431
+ if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
432
+ try:
433
+ shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
434
+ except Exception as e:
435
+ print(f"Failed to clean up temporary directory: {e}")
436
+
437
+ # Reset the session state by returning a new initial state
438
+ new_session_state = get_initial_session_state()
439
+
440
+ return (
441
+ None, # Clear file input
442
+ "", # Clear test image selection
443
+ None, # Clear result image
444
+ "Waiting for processing results...", # Reset info display
445
+ "## Waiting for processing results...", # Reset Markdown display
446
+ "🕐 Waiting for parsing result...", # Clear raw Markdown text
447
+ gr.update(visible=False), # Hide download button
448
+ "<div id='page_info_box'>0 / 0</div>", # Reset page info
449
+ "🕐 Waiting for parsing result...", # Clear current page JSON
450
+ new_session_state
451
+ )
452
+
453
+ def update_prompt_display(prompt_mode):
454
+ """Updates the prompt display content"""
455
+ return dict_promptmode_to_prompt[prompt_mode]
456
+
457
+ # ==================== Gradio Interface ====================
458
+ def create_gradio_interface():
459
+ """Creates the Gradio interface"""
460
+
461
+ # CSS styles, matching the reference style
462
+ css = """
463
+
464
+ #parse_button {
465
+ background: #FF576D !important; /* !important 确保覆盖主题默认样式 */
466
+ border-color: #FF576D !important;
467
+ }
468
+ /* 鼠标悬停时的颜色 */
469
+ #parse_button:hover {
470
+ background: #F72C49 !important;
471
+ border-color: #F72C49 !important;
472
+ }
473
+
474
+ #page_info_html {
475
+ display: flex;
476
+ align-items: center;
477
+ justify-content: center;
478
+ height: 100%;
479
+ margin: 0 12px;
480
+ }
481
+
482
+ #page_info_box {
483
+ padding: 8px 20px;
484
+ font-size: 16px;
485
+ border: 1px solid #bbb;
486
+ border-radius: 8px;
487
+ background-color: #f8f8f8;
488
+ text-align: center;
489
+ min-width: 80px;
490
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
491
+ }
492
+
493
+ #markdown_output {
494
+ min-height: 800px;
495
+ overflow: auto;
496
+ }
497
+
498
+ footer {
499
+ visibility: hidden;
500
+ }
501
+
502
+ #info_box {
503
+ padding: 10px;
504
+ background-color: #f8f9fa;
505
+ border-radius: 8px;
506
+ border: 1px solid #dee2e6;
507
+ margin: 10px 0;
508
+ font-size: 14px;
509
+ }
510
+
511
+ #result_image {
512
+ border-radius: 8px;
513
+ }
514
+
515
+ #markdown_tabs {
516
+ height: 100%;
517
+ }
518
+ """
519
+
520
+ with gr.Blocks(theme="ocean", css=css, title='dots.ocr') as demo:
521
+ session_state = gr.State(value=get_initial_session_state())
522
+
523
+ # Title
524
+ gr.HTML("""
525
+ <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
526
+ <h1 style="margin: 0; font-size: 2em;">🔍 dots.ocr</h1>
527
+ </div>
528
+ <div style="text-align: center; margin-bottom: 10px;">
529
+ <em>Supports image/PDF layout analysis and structured output</em>
530
+ </div>
531
+ """)
532
+
533
+ with gr.Row():
534
+ # Left side: Input and Configuration
535
+ with gr.Column(scale=1, elem_id="left-panel"):
536
+ gr.Markdown("### 📥 Upload & Select")
537
+ file_input = gr.File(
538
+ label="Upload PDF/Image",
539
+ type="filepath",
540
+ file_types=[".pdf", ".jpg", ".jpeg", ".png"],
541
+ )
542
+
543
+ test_images = get_test_images()
544
+ test_image_input = gr.Dropdown(
545
+ label="Or Select an Example",
546
+ choices=[""] + test_images,
547
+ value="",
548
+ )
549
+
550
+ gr.Markdown("### ⚙️ Prompt & Actions")
551
+ prompt_mode = gr.Dropdown(
552
+ label="Select Prompt",
553
+ choices=["prompt_layout_all_en", "prompt_layout_only_en", "prompt_ocr"],
554
+ value="prompt_layout_all_en",
555
+ )
556
+
557
+ # Display current prompt content
558
+ prompt_display = gr.Textbox(
559
+ label="Current Prompt Content",
560
+ value=dict_promptmode_to_prompt[list(dict_promptmode_to_prompt.keys())[0]],
561
+ lines=4,
562
+ max_lines=8,
563
+ interactive=False,
564
+ show_copy_button=True
565
+ )
566
+
567
+ with gr.Row():
568
+ process_btn = gr.Button("🔍 Parse", variant="primary", scale=2, elem_id="parse_button")
569
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
570
+
571
+ with gr.Accordion("🛠️ Advanced Configuration", open=False):
572
+ fitz_preprocess = gr.Checkbox(
573
+ label="Enable fitz_preprocess for images",
574
+ value=True,
575
+ info="Processes image via a PDF-like pipeline (image->pdf->200dpi image). Recommended if your image DPI is low."
576
+ )
577
+ with gr.Row():
578
+ server_ip = gr.Textbox(label="Server IP", value=DEFAULT_CONFIG['ip'])
579
+ server_port = gr.Number(label="Port", value=DEFAULT_CONFIG['port_vllm'], precision=0)
580
+ with gr.Row():
581
+ min_pixels = gr.Number(label="Min Pixels", value=DEFAULT_CONFIG['min_pixels'], precision=0)
582
+ max_pixels = gr.Number(label="Max Pixels", value=DEFAULT_CONFIG['max_pixels'], precision=0)
583
+ # Right side: Result Display
584
+ with gr.Column(scale=6, variant="compact"):
585
+ with gr.Row():
586
+ # Result Image
587
+ with gr.Column(scale=3):
588
+ gr.Markdown("### 👁️ File Preview")
589
+ result_image = gr.Image(
590
+ label="Layout Preview",
591
+ visible=True,
592
+ height=800,
593
+ show_label=False
594
+ )
595
+
596
+ # Page navigation (shown during PDF preview)
597
+ with gr.Row():
598
+ prev_btn = gr.Button("⬅ Previous", size="sm")
599
+ page_info = gr.HTML(
600
+ value="<div id='page_info_box'>0 / 0</div>",
601
+ elem_id="page_info_html"
602
+ )
603
+ next_btn = gr.Button("Next ➡", size="sm")
604
+
605
+ # Info Display
606
+ info_display = gr.Markdown(
607
+ "Waiting for processing results...",
608
+ elem_id="info_box"
609
+ )
610
+
611
+ # Markdown Result
612
+ with gr.Column(scale=3):
613
+ gr.Markdown("### ✔️ Result Display")
614
+
615
+ with gr.Tabs(elem_id="markdown_tabs"):
616
+ with gr.TabItem("Markdown Render Preview"):
617
+ md_output = gr.Markdown(
618
+ "## Please click the parse button to parse or select for single-task recognition...",
619
+ max_height=600,
620
+ latex_delimiters=[
621
+ {"left": "$$", "right": "$$", "display": True},
622
+ {"left": "$", "right": "$", "display": False}
623
+ ],
624
+ show_copy_button=False,
625
+ elem_id="markdown_output"
626
+ )
627
+
628
+ with gr.TabItem("Markdown Raw Text"):
629
+ md_raw_output = gr.Textbox(
630
+ value="🕐 Waiting for parsing result...",
631
+ label="Markdown Raw Text",
632
+ max_lines=100,
633
+ lines=38,
634
+ show_copy_button=True,
635
+ elem_id="markdown_output",
636
+ show_label=False
637
+ )
638
+
639
+ with gr.TabItem("Current Page JSON"):
640
+ current_page_json = gr.Textbox(
641
+ value="🕐 Waiting for parsing result...",
642
+ label="Current Page JSON",
643
+ max_lines=100,
644
+ lines=38,
645
+ show_copy_button=True,
646
+ elem_id="markdown_output",
647
+ show_label=False
648
+ )
649
+
650
+ # Download Button
651
+ with gr.Row():
652
+ download_btn = gr.DownloadButton(
653
+ "⬇️ Download Results",
654
+ visible=False
655
+ )
656
+
657
+ # When the prompt mode changes, update the display content
658
+ prompt_mode.change(
659
+ fn=update_prompt_display,
660
+ inputs=prompt_mode,
661
+ outputs=prompt_display,
662
+ )
663
+
664
+ # Show preview on file upload
665
+ file_input.upload(
666
+ # fn=lambda file_data, state: load_file_for_preview(file_data, state),
667
+ fn=load_file_for_preview,
668
+ inputs=[file_input, session_state],
669
+ outputs=[result_image, page_info, session_state]
670
+ )
671
+
672
+ # Also handle test image selection
673
+ test_image_input.change(
674
+ # fn=lambda path, state: load_file_for_preview(path, state),
675
+ fn=load_file_for_preview,
676
+ inputs=[test_image_input, session_state],
677
+ outputs=[result_image, page_info, session_state]
678
+ )
679
+
680
+ prev_btn.click(
681
+ fn=lambda s: turn_page("prev", s),
682
+ inputs=[session_state],
683
+ outputs=[result_image, page_info, current_page_json, session_state]
684
+ )
685
+
686
+ next_btn.click(
687
+ fn=lambda s: turn_page("next", s),
688
+ inputs=[session_state],
689
+ outputs=[result_image, page_info, current_page_json, session_state]
690
+ )
691
+
692
+ process_btn.click(
693
+ fn=process_image_inference,
694
+ inputs=[
695
+ session_state, test_image_input, file_input,
696
+ prompt_mode, server_ip, server_port, min_pixels, max_pixels,
697
+ fitz_preprocess
698
+ ],
699
+ outputs=[
700
+ result_image, info_display, md_output, md_raw_output,
701
+ download_btn, page_info, current_page_json, session_state
702
+ ]
703
+ )
704
+
705
+ clear_btn.click(
706
+ fn=clear_all_data,
707
+ inputs=[session_state],
708
+ outputs=[
709
+ file_input, test_image_input,
710
+ result_image, info_display, md_output, md_raw_output,
711
+ download_btn, page_info, current_page_json, session_state
712
+ ]
713
+ )
714
+
715
+ return demo
716
+
+# ==================== Main Program ====================
+if __name__ == "__main__":
+    import sys
+    # The UI port comes from the first CLI argument; fall back to 7860 if omitted.
+    port = int(sys.argv[1]) if len(sys.argv) > 1 else 7860
+    demo = create_gradio_interface()
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        debug=True
+    )
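A typical launch, assuming a vLLM backend started via `demo/launch_model_vllm.sh` is already serving on `127.0.0.1:8000`, is `python demo/demo_gradio.py 7860`; the positional argument is the Gradio UI port (7860 here is just an example value).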
demo/demo_gradio_annotion.py ADDED
@@ -0,0 +1,666 @@
+"""
+Layout Inference Web Application with Gradio - Annotation Version
+
+A Gradio-based layout inference tool that supports image uploads and multiple backend inference engines.
+This version adds an image annotation feature, allowing users to draw bounding boxes on an image and send both the image and the boxes to the model.
+"""
+
+import gradio as gr
+import json
+import os
+import io
+import tempfile
+import base64
+import zipfile
+import uuid
+import re
+from pathlib import Path
+from PIL import Image
+import requests
+from gradio_image_annotation import image_annotator
+
+# Local utility imports
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
+from dots_ocr.utils.demo_utils.display import read_image
+from dots_ocr.utils.doc_utils import load_images_from_pdf
+
+# Add DotsOCRParser import
+from dots_ocr.parser import DotsOCRParser
+
+# ==================== Configuration ====================
+DEFAULT_CONFIG = {
+    'ip': "127.0.0.1",
+    'port_vllm': 8000,
+    'min_pixels': MIN_PIXELS,
+    'max_pixels': MAX_PIXELS,
+    'test_images_dir': "./assets/showcase_origin",
+}
+
+# ==================== Global Variables ====================
+# Store the current configuration
+current_config = DEFAULT_CONFIG.copy()
+
+# Create a DotsOCRParser instance
+dots_parser = DotsOCRParser(
+    ip=DEFAULT_CONFIG['ip'],
+    port=DEFAULT_CONFIG['port_vllm'],
+    dpi=200,
+    min_pixels=DEFAULT_CONFIG['min_pixels'],
+    max_pixels=DEFAULT_CONFIG['max_pixels']
+)
+
+# Store processing results
+processing_results = {
+    'original_image': None,
+    'processed_image': None,
+    'layout_result': None,
+    'markdown_content': None,
+    'cells_data': None,
+    'temp_dir': None,
+    'session_id': None,
+    'result_paths': None,
+    'annotation_data': None  # Store annotation data
+}
+
+# ==================== Utility Functions ====================
+def read_image_v2(img):
+    """Reads an image, supporting URLs and local paths."""
+    if isinstance(img, str) and img.startswith(("http://", "https://")):
+        with requests.get(img, stream=True) as response:
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+    elif isinstance(img, str):
+        img, _, _ = read_image(img, use_native=True)
+    elif isinstance(img, Image.Image):
+        pass
+    else:
+        raise ValueError(f"Invalid image type: {type(img)}")
+    return img
+
+def get_test_images():
+    """Gets the list of test images."""
+    test_images = []
+    test_dir = current_config['test_images_dir']
+    if os.path.exists(test_dir):
+        test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)
+                       if name.lower().endswith(('.png', '.jpg', '.jpeg'))]
+    return test_images
+
+def create_temp_session_dir():
+    """Creates a unique temporary directory for each processing request."""
+    session_id = uuid.uuid4().hex[:8]
+    temp_dir = os.path.join(tempfile.gettempdir(), f"dots_ocr_demo_{session_id}")
+    os.makedirs(temp_dir, exist_ok=True)
+    return temp_dir, session_id
+
+def parse_image_with_bbox(parser, image, prompt_mode, bbox=None, fitz_preprocess=False):
+    """
+    Processes an image using DotsOCRParser, with support for the bbox parameter.
+    """
+    # Create a temporary session directory
+    temp_dir, session_id = create_temp_session_dir()
+
+    try:
+        # Save the PIL Image to a temporary file
+        temp_image_path = os.path.join(temp_dir, f"input_{session_id}.png")
+        image.save(temp_image_path, "PNG")
+
+        # Use the high-level parse_image interface, passing the bbox parameter
+        filename = f"demo_{session_id}"
+        results = parser.parse_image(
+            input_path=temp_image_path,
+            filename=filename,
+            prompt_mode=prompt_mode,
+            save_dir=temp_dir,
+            bbox=bbox,
+            fitz_preprocess=fitz_preprocess
+        )
+
+        # Parse the results
+        if not results:
+            raise ValueError("No results returned from parser")
+
+        result = results[0]  # parse_image returns a list with a single result
+
+        # Read the result files
+        layout_image = None
+        cells_data = None
+        md_content = None
+        filtered = False
+
+        # Read the layout image
+        if 'layout_image_path' in result and os.path.exists(result['layout_image_path']):
+            layout_image = Image.open(result['layout_image_path'])
+
+        # Read the JSON data
+        if 'layout_info_path' in result and os.path.exists(result['layout_info_path']):
+            with open(result['layout_info_path'], 'r', encoding='utf-8') as f:
+                cells_data = json.load(f)
+
+        # Read the Markdown content
+        if 'md_content_path' in result and os.path.exists(result['md_content_path']):
+            with open(result['md_content_path'], 'r', encoding='utf-8') as f:
+                md_content = f.read()
+
+        # Check for the original response file (if JSON parsing fails)
+        if 'filtered' in result:
+            filtered = result['filtered']
+
+        return {
+            'layout_image': layout_image,
+            'cells_data': cells_data,
+            'md_content': md_content,
+            'filtered': filtered,
+            'temp_dir': temp_dir,
+            'session_id': session_id,
+            'result_paths': result
+        }
+
+    except Exception as e:
+        # Clean up the temporary directory on error
+        import shutil
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        raise e
+
+def process_annotation_data(annotation_data):
+    """Processes annotation data, converting it to the format required by the model."""
+    if not annotation_data or not annotation_data.get('boxes'):
+        return None, None
+
+    # Get image and box data
+    image = annotation_data.get('image')
+    boxes = annotation_data.get('boxes', [])
+
+    if not boxes:
+        return image, None
+
+    # Ensure the image is in PIL Image format
+    if image is not None:
+        import numpy as np
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        elif not isinstance(image, Image.Image):
+            # If it's another format, try to convert it
+            try:
+                image = Image.open(image) if isinstance(image, str) else Image.fromarray(image)
+            except Exception as e:
+                print(f"Image format conversion failed: {e}")
+                return None, None
+
+    # Get the coordinate information of the box (only one box)
+    box = boxes[0]
+    bbox = [box['xmin'], box['ymin'], box['xmax'], box['ymax']]
+
+    return image, bbox
+
+# ==================== Core Processing Function ====================
+def process_image_inference_with_annotation(annotation_data, test_image_input,
+                                            prompt_mode, server_ip, server_port, min_pixels, max_pixels,
+                                            fitz_preprocess=False):
+    """Core function for image inference, supporting annotation data."""
+    global current_config, processing_results, dots_parser
+
+    # First, clean up previous processing results
+    if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
+        import shutil
+        try:
+            shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
+        except Exception as e:
+            print(f"Failed to clean up previous temporary directory: {e}")
+
+    # Reset processing results
+    processing_results = {
+        'original_image': None,
+        'processed_image': None,
+        'layout_result': None,
+        'markdown_content': None,
+        'cells_data': None,
+        'temp_dir': None,
+        'session_id': None,
+        'result_paths': None,
+        'annotation_data': annotation_data
+    }
+
+    # Update configuration
+    current_config.update({
+        'ip': server_ip,
+        'port_vllm': server_port,
+        'min_pixels': min_pixels,
+        'max_pixels': max_pixels
+    })
+
+    # Update parser configuration
+    dots_parser.ip = server_ip
+    dots_parser.port = server_port
+    dots_parser.min_pixels = min_pixels
+    dots_parser.max_pixels = max_pixels
+
+    # Determine the input source and process annotation data
+    image = None
+    bbox = None
+
+    # Prioritize processing annotation data
+    if annotation_data and annotation_data.get('image') is not None:
+        image, bbox = process_annotation_data(annotation_data)
+        if image is not None:
+            # If there's a bbox, force the use of 'prompt_grounding_ocr' mode
+            assert bbox is not None
+            prompt_mode = "prompt_grounding_ocr"
+
+    # If there's no annotation data, check the test image input
+    if image is None and test_image_input and test_image_input != "":
+        try:
+            image = read_image_v2(test_image_input)
+        except Exception as e:
+            return None, f"Failed to read test image: {e}", "", "", gr.update(value=None), ""
+
+    if image is None:
+        return None, "Please select a test image or add an image in the annotation component", "", "", gr.update(value=None), ""
+    if bbox is None:
+        # Six return values to match the six output components bound below.
+        return "Please select a bounding box by mouse", "Please select a bounding box by mouse", "", "", gr.update(value=None), ""
+
+    try:
+        # Process using DotsOCRParser, passing the bbox parameter
+        original_image = image
+        parse_result = parse_image_with_bbox(dots_parser, image, prompt_mode, bbox, fitz_preprocess)
+
+        # Extract parsing results
+        layout_image = parse_result['layout_image']
+        cells_data = parse_result['cells_data']
+        md_content = parse_result['md_content']
+        filtered = parse_result['filtered']
+
+        # Store the results
+        processing_results.update({
+            'original_image': original_image,
+            'processed_image': None,
+            'layout_result': layout_image,
+            'markdown_content': md_content,
+            'cells_data': cells_data,
+            'temp_dir': parse_result['temp_dir'],
+            'session_id': parse_result['session_id'],
+            'result_paths': parse_result['result_paths'],
+            'annotation_data': annotation_data
+        })
+
+        # Handle the case where parsing fails
+        if filtered:
+            info_text = f"""
+**Image Information:**
+- Original Dimensions: {original_image.width} x {original_image.height}
+- Processing Mode: {'Region OCR' if bbox else 'Full Image OCR'}
+- Processing Status: JSON parsing failed, using cleaned text output
+- Server: {current_config['ip']}:{current_config['port_vllm']}
+- Session ID: {parse_result['session_id']}
+- Box Coordinates: {bbox if bbox else 'None'}
+"""
+
+            return (
+                md_content or "No markdown content generated",
+                info_text,
+                md_content or "No markdown content generated",
+                md_content or "No markdown content generated",
+                gr.update(visible=False),
+                ""
+            )
+
+        # Handle the case where JSON parsing succeeds
+        num_elements = len(cells_data) if cells_data else 0
+        info_text = f"""
+**Image Information:**
+- Original Dimensions: {original_image.width} x {original_image.height}
+- Processing Mode: {'Region OCR' if bbox else 'Full Image OCR'}
+- Server: {current_config['ip']}:{current_config['port_vllm']}
+- Detected {num_elements} layout elements
+- Session ID: {parse_result['session_id']}
+- Box Coordinates: {bbox if bbox else 'None'}
+"""
+
+        # Current page JSON output
+        current_json = ""
+        if cells_data:
+            try:
+                current_json = json.dumps(cells_data, ensure_ascii=False, indent=2)
+            except Exception:
+                current_json = str(cells_data)
+
+        # Create a downloadable ZIP file
+        download_zip_path = None
+        if parse_result['temp_dir']:
+            download_zip_path = os.path.join(parse_result['temp_dir'], f"layout_results_{parse_result['session_id']}.zip")
+            try:
+                with zipfile.ZipFile(download_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                    for root, dirs, files in os.walk(parse_result['temp_dir']):
+                        for file in files:
+                            if file.endswith('.zip'):
+                                continue
+                            file_path = os.path.join(root, file)
+                            arcname = os.path.relpath(file_path, parse_result['temp_dir'])
+                            zipf.write(file_path, arcname)
+            except Exception as e:
+                print(f"Failed to create download ZIP: {e}")
+                download_zip_path = None
+
+        return (
+            md_content or "No markdown content generated",
+            info_text,
+            md_content or "No markdown content generated",
+            md_content or "No markdown content generated",
+            gr.update(value=download_zip_path, visible=True) if download_zip_path else gr.update(visible=False),
+            current_json
+        )
+
+    except Exception as e:
+        return f"An error occurred during processing: {e}", f"An error occurred during processing: {e}", "", "", gr.update(value=None), ""
+
+def load_image_to_annotator(test_image_input):
+    """Loads an image into the annotation component."""
+    image = None
+
+    # Check the test image input
+    if test_image_input and test_image_input != "":
+        try:
+            image = read_image_v2(test_image_input)
+        except Exception:
+            return None
+
+    if image is None:
+        return None
+
+    # Return the format required by the annotation component
+    return {
+        "image": image,
+        "boxes": []
+    }
+
+def clear_all_data():
+    """Clears all data."""
+    global processing_results
+
+    # Clean up the temporary directory
+    if processing_results.get('temp_dir') and os.path.exists(processing_results['temp_dir']):
+        import shutil
+        try:
+            shutil.rmtree(processing_results['temp_dir'], ignore_errors=True)
+        except Exception as e:
+            print(f"Failed to clean up temporary directory: {e}")
+
+    # Reset processing results
+    processing_results = {
+        'original_image': None,
+        'processed_image': None,
+        'layout_result': None,
+        'markdown_content': None,
+        'cells_data': None,
+        'temp_dir': None,
+        'session_id': None,
+        'result_paths': None,
+        'annotation_data': None
+    }
+
+    return (
+        "",  # Clear test image selection
+        None,  # Clear annotation component
+        "Waiting for processing results...",  # Reset info display
+        "## Waiting for processing results...",  # Reset Markdown display
+        "🕐 Waiting for parsing results...",  # Clear raw Markdown text
+        gr.update(visible=False),  # Hide download button
+        "🕐 Waiting for parsing results..."  # Clear JSON
+    )
+
+def update_prompt_display(prompt_mode):
+    """Updates the displayed prompt content."""
+    return dict_promptmode_to_prompt[prompt_mode]
+
+# ==================== Gradio Interface ====================
+def create_gradio_interface():
+    """Creates the Gradio interface."""
+
+    # CSS styling to match the reference style
+    css = """
+    footer {
+        visibility: hidden;
+    }
+
+    #info_box {
+        padding: 10px;
+        background-color: #f8f9fa;
+        border-radius: 8px;
+        border: 1px solid #dee2e6;
+        margin: 10px 0;
+        font-size: 14px;
+    }
+
+    #markdown_tabs {
+        height: 100%;
+    }
+
+    #annotation_component {
+        border-radius: 8px;
+    }
+    """
+
+    with gr.Blocks(theme="ocean", css=css, title='dots.ocr - Annotation') as demo:
+
+        # Title
+        gr.HTML("""
+        <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 20px;">
+            <h1 style="margin: 0; font-size: 2em;">🔍 dots.ocr - Annotation Version</h1>
+        </div>
+        <div style="text-align: center; margin-bottom: 10px;">
+            <em>Supports image annotation, drawing boxes, and sending box information to the model for OCR.</em>
+        </div>
+        """)
+
+        with gr.Row():
+            # Left side: Input and Configuration
+            with gr.Column(scale=1, variant="compact"):
+                gr.Markdown("### 📁 Select Example")
+                test_images = get_test_images()
+                test_image_input = gr.Dropdown(
+                    label="Select Example",
+                    choices=[""] + test_images,
+                    value="",
+                    show_label=True
+                )
+
+                # Button to load image into the annotation component
+                load_btn = gr.Button("📷 Load Image to Annotation Area", variant="secondary")
+
+                prompt_mode = gr.Dropdown(
+                    label="Select Prompt",
+                    choices=["prompt_grounding_ocr"],
+                    value="prompt_grounding_ocr",
+                    show_label=True,
+                    info="If a box is drawn, 'prompt_grounding_ocr' mode will be used automatically."
+                )
+
+                # Display the current prompt content
+                prompt_display = gr.Textbox(
+                    label="Current Prompt Content",
+                    value=dict_promptmode_to_prompt["prompt_grounding_ocr"],
+                    lines=4,
+                    max_lines=8,
+                    interactive=False,
+                    show_copy_button=True
+                )
+
+                gr.Markdown("### ⚙️ Actions")
+                process_btn = gr.Button("🔍 Parse", variant="primary")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                gr.Markdown("### 🛠️ Configuration")
+
+                fitz_preprocess = gr.Checkbox(
+                    label="Enable fitz_preprocess",
+                    value=False,
+                    info="Performs fitz preprocessing on the image input, converting the image to a PDF and then to a 200dpi image."
+                )
+
+                with gr.Row():
+                    server_ip = gr.Textbox(
+                        label="Server IP",
+                        value=DEFAULT_CONFIG['ip']
+                    )
+                    server_port = gr.Number(
+                        label="Port",
+                        value=DEFAULT_CONFIG['port_vllm'],
+                        precision=0
+                    )
+
+                with gr.Row():
+                    min_pixels = gr.Number(
+                        label="Min Pixels",
+                        value=DEFAULT_CONFIG['min_pixels'],
+                        precision=0
+                    )
+                    max_pixels = gr.Number(
+                        label="Max Pixels",
+                        value=DEFAULT_CONFIG['max_pixels'],
+                        precision=0
+                    )
+
+            # Right side: Result Display
+            with gr.Column(scale=6, variant="compact"):
+                with gr.Row():
+                    # Image Annotation Area
+                    with gr.Column(scale=3):
+                        gr.Markdown("### 🎯 Image Annotation Area")
+                        gr.Markdown("""
+                        **Instructions:**
+                        - Method 1: Select an example image on the left and click "Load Image to Annotation Area".
+                        - Method 2: Upload an image directly in the annotation area below (drag and drop or click to upload).
+                        - Use the mouse to draw a box on the image to select the region for recognition.
+                        - Only one box can be drawn. To draw a new one, please delete the old one first.
+                        - **Hotkey: Press the Delete key to remove the selected box.**
+                        - After drawing a box, clicking Parse will automatically use the Region OCR mode.
+                        """)
+
+                        annotator = image_annotator(
+                            value=None,
+                            label="Image Annotation",
+                            height=600,
+                            show_label=False,
+                            elem_id="annotation_component",
+                            single_box=True,  # Only allow one box; a new box will replace the old one
+                            box_min_size=10,
+                            interactive=True,
+                            disable_edit_boxes=True,  # Disable the edit dialog
+                            label_list=["OCR Region"],  # Set the default label
+                            label_colors=[(255, 0, 0)],  # Set color to red
+                            use_default_label=True,  # Use the default label
+                            image_type="pil"  # Ensure it returns a PIL Image format
+                        )
+
+                        # Information Display
+                        info_display = gr.Markdown(
+                            "Waiting for processing results...",
+                            elem_id="info_box"
+                        )
+
+                    # Result Display Area
+                    with gr.Column(scale=3):
+                        gr.Markdown("### ✅ Results")
+
+                        with gr.Tabs(elem_id="markdown_tabs"):
+                            with gr.TabItem("Markdown Rendered View"):
+                                md_output = gr.Markdown(
+                                    "## Please upload an image and click the Parse button for recognition...",
+                                    label="Markdown Preview",
+                                    max_height=1000,
+                                    latex_delimiters=[
+                                        {"left": "$$", "right": "$$", "display": True},
+                                        {"left": "$", "right": "$", "display": False},
+                                    ],
+                                    show_copy_button=False,
+                                    elem_id="markdown_output"
+                                )
+
+                            with gr.TabItem("Markdown Raw Text"):
+                                md_raw_output = gr.Textbox(
+                                    value="🕐 Waiting for parsing results...",
+                                    label="Markdown Raw Text",
+                                    max_lines=100,
+                                    lines=38,
+                                    show_copy_button=True,
+                                    elem_id="markdown_output",
+                                    show_label=False
+                                )
+
+                            with gr.TabItem("JSON Result"):
+                                json_output = gr.Textbox(
+                                    value="🕐 Waiting for parsing results...",
+                                    label="JSON Result",
+                                    max_lines=100,
+                                    lines=38,
+                                    show_copy_button=True,
+                                    elem_id="markdown_output",
+                                    show_label=False
+                                )
+
+                # Download Button
+                with gr.Row():
+                    download_btn = gr.DownloadButton(
+                        "⬇️ Download Results",
+                        visible=False
+                    )
+
+        # Event Binding
+
+        # When the prompt mode changes, update the displayed content
+        prompt_mode.change(
+            fn=update_prompt_display,
+            inputs=prompt_mode,
+            outputs=prompt_display,
+            show_progress=False
+        )
+
+        # Load image into the annotation component
+        load_btn.click(
+            fn=load_image_to_annotator,
+            inputs=[test_image_input],
+            outputs=annotator,
+            show_progress=False
+        )
+
+        # Process Inference
+        process_btn.click(
+            fn=process_image_inference_with_annotation,
+            inputs=[
+                annotator, test_image_input,
+                prompt_mode, server_ip, server_port, min_pixels, max_pixels,
+                fitz_preprocess
+            ],
+            outputs=[
+                md_output, info_display, md_raw_output, md_raw_output,
+                download_btn, json_output
+            ],
+            show_progress=True
+        )
+
+        # Clear Data
+        clear_btn.click(
+            fn=clear_all_data,
+            outputs=[
+                test_image_input, annotator,
+                info_display, md_output, md_raw_output,
+                download_btn, json_output
+            ],
+            show_progress=False
+        )
+
+    return demo
+
+# ==================== Main Program ====================
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7861,  # Use a different port to avoid conflicts
+        debug=True
+    )
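For reference, the annotator's box dict is reduced to a plain `[xmin, ymin, xmax, ymax]` list before it reaches the parser. A minimal sketch of that flow, assuming the repo root is on `PYTHONPATH`, a vLLM server is running at the default address, and the coordinates and image path below are illustrative only:

```python
from PIL import Image

# Helpers defined in demo/demo_gradio_annotion.py above
from demo.demo_gradio_annotion import dots_parser, parse_image_with_bbox, process_annotation_data

annotation_data = {
    "image": Image.open("assets/showcase_origin/table_1.jpg"),  # illustrative input
    "boxes": [{"xmin": 100, "ymin": 120, "xmax": 640, "ymax": 480, "label": "OCR Region"}],
}
image, bbox = process_annotation_data(annotation_data)  # bbox -> [100, 120, 640, 480]
result = parse_image_with_bbox(dots_parser, image, "prompt_grounding_ocr", bbox=bbox)
print(result["md_content"])
```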
demo/demo_hf.py ADDED
@@ -0,0 +1,71 @@
+import os
+if "LOCAL_RANK" not in os.environ:
+    os.environ["LOCAL_RANK"] = "0"
+
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+from dots_ocr.utils import dict_promptmode_to_prompt
+
+def inference(image_path, prompt, model, processor):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image_path
+                },
+                {"type": "text", "text": prompt}
+            ]
+        }
+    ]
+
+    # Preparation for inference
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = inputs.to("cuda")
+
+    # Inference: generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=24000)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    print(output_text)
+
+
+if __name__ == "__main__":
+    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+    model_path = "./weights/DotsOCR"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        attn_implementation="flash_attention_2",
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+    image_path = "demo/demo_image1.jpg"
+    for prompt_mode, prompt in dict_promptmode_to_prompt.items():
+        print(f"prompt: {prompt}")
+        inference(image_path, prompt, model, processor)
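If only a single task is needed rather than looping over every mode, the same helpers support one call. A sketch reusing the `model`/`processor` built in the `__main__` block above (`prompt_ocr` is one of the keys exposed by `dict_promptmode_to_prompt`):

```python
prompt = dict_promptmode_to_prompt["prompt_ocr"]  # plain OCR; inspect the dict for all modes
inference("demo/demo_image1.jpg", prompt, model, processor)
```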
demo/demo_image1.jpg ADDED

Git LFS Details
  • SHA256: 90345584ccc2c4a883779e5d47693276e8cf3fe752700af4f03b3142ab46cfa2
  • Pointer size: 131 Bytes
  • Size of remote file: 773 kB
demo/demo_pdf1.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:570c44a595f52e963d0522fb561b338c327550b37974448f4e4f43c605b72f42
+size 461448
demo/demo_streamlit.py ADDED
@@ -0,0 +1,222 @@
+"""
+Layout Inference Web Application
+
+A Streamlit-based layout inference tool that supports image uploads and multiple backend inference engines.
+"""
+
+import streamlit as st
+import json
+import os
+import io
+import tempfile
+from PIL import Image
+import requests
+
+# Local utility imports
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.utils.format_transformer import layoutjson2md
+from dots_ocr.utils.layout_utils import draw_layout_on_image, post_process_cells
+from dots_ocr.utils.image_utils import get_input_dimensions, get_image_by_fitz_doc
+from dots_ocr.model.inference import inference_with_vllm
+from dots_ocr.utils.consts import MIN_PIXELS, MAX_PIXELS
+from dots_ocr.utils.demo_utils.display import read_image
+
+
+# ==================== Configuration ====================
+DEFAULT_CONFIG = {
+    'ip': "127.0.0.1",
+    'port_vllm': 8000,
+    'min_pixels': MIN_PIXELS,
+    'max_pixels': MAX_PIXELS,
+    'test_images_dir': "./assets/showcase_origin",
+}
+
+# ==================== Utility Functions ====================
+@st.cache_resource
+def read_image_v2(img: str):
+    if img.startswith(("http://", "https://")):
+        with requests.get(img, stream=True) as response:
+            response.raise_for_status()
+            img = Image.open(io.BytesIO(response.content))
+
+    if isinstance(img, str):
+        img, _, _ = read_image(img, use_native=True)
+    elif isinstance(img, Image.Image):
+        pass
+    else:
+        raise ValueError(f"Invalid image type: {type(img)}")
+    return img
+
+
+# ==================== UI Components ====================
+def create_config_sidebar():
+    """Create configuration sidebar"""
+    st.sidebar.header("Configuration Parameters")
+
+    config = {}
+    config['prompt_key'] = st.sidebar.selectbox("Prompt Mode", list(dict_promptmode_to_prompt.keys()))
+    config['ip'] = st.sidebar.text_input("Server IP", DEFAULT_CONFIG['ip'])
+    config['port'] = st.sidebar.number_input("Port", min_value=1000, max_value=9999, value=DEFAULT_CONFIG['port_vllm'])
+
+    # Image configuration
+    st.sidebar.subheader("Image Configuration")
+    config['min_pixels'] = st.sidebar.number_input("Min Pixels", value=DEFAULT_CONFIG['min_pixels'])
+    config['max_pixels'] = st.sidebar.number_input("Max Pixels", value=DEFAULT_CONFIG['max_pixels'])
+
+    return config
+
+def get_image_input():
+    """Get image input"""
+    st.markdown("#### Image Input")
+
+    input_mode = st.pills(label="Select input method", options=["Upload Image", "Enter Image URL/Path", "Select Test Image"], key="input_mode", label_visibility="collapsed")
+
+    if input_mode == "Upload Image":
+        # File uploader
+        uploaded_file = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
+        if uploaded_file is not None:
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
+                tmp_file.write(uploaded_file.getvalue())
+                return tmp_file.name
+    elif input_mode == 'Enter Image URL/Path':
+        # URL input
+        img_url_input = st.text_input("Enter Image URL/Path")
+        return img_url_input
+    elif input_mode == 'Select Test Image':
+        # Test image selection
+        test_images = []
+        test_dir = DEFAULT_CONFIG['test_images_dir']
+        if os.path.exists(test_dir):
+            test_images = [os.path.join(test_dir, name) for name in os.listdir(test_dir)]
+        img_url_test = st.selectbox("Select Test Image", [""] + test_images)
+        return img_url_test
+
+    # st.pills returns None until the user picks an input method
+    return None
+
+
+def process_and_display_results(output: dict, image: Image.Image, config: dict):
+    """Process and display inference results"""
+    prompt, response = output['prompt'], output['response']
+
+    try:
+        col1, col2 = st.columns(2)
+        cells = json.loads(response)
+
+        # Post-processing
+        cells = post_process_cells(
+            image, cells,
+            image.width, image.height,
+            min_pixels=config['min_pixels'],
+            max_pixels=config['max_pixels']
+        )
+
+        # Calculate input dimensions
+        input_width, input_height = get_input_dimensions(
+            image,
+            min_pixels=config['min_pixels'],
+            max_pixels=config['max_pixels']
+        )
+        st.markdown('---')
+        st.write(f'Input Dimensions: {input_width} x {input_height}')
+        st.text_area('Original Model Output', response, height=200)
+        st.text_area('Post-processed Result', str(cells), height=200)
+
+        # Display results
+        with col1:
+            new_image = draw_layout_on_image(
+                image, cells,
+                resized_height=None, resized_width=None,
+                fill_bbox=True, draw_bbox=True
+            )
+            st.markdown('##### Visualization Result')
+            st.image(new_image, width=new_image.width)
+
+        with col2:
+            md_code = layoutjson2md(image, cells, text_key='text')
+            st.markdown('##### Markdown Format')
+            st.markdown(md_code, unsafe_allow_html=True)
+
+    except json.JSONDecodeError:
+        st.error("Model output is not a valid JSON format")
+    except Exception as e:
+        st.error(f"Error processing results: {e}")
+
+# ==================== Main Application ====================
+def main():
+    """Main application function"""
+    st.set_page_config(page_title="Layout Inference Tool", layout="wide")
+    st.title("🔍 Layout Inference Tool")
+
+    # Configuration
+    config = create_config_sidebar()
+    prompt = dict_promptmode_to_prompt[config['prompt_key']]
+    st.sidebar.info(f"Current Prompt: {prompt}")
+
+    # Image input
+    img_url = get_image_input()
+    start_button = st.button('🚀 Start Inference', type="primary")
+
+    if img_url is not None and img_url.strip() != "":
+        try:
+            origin_image = read_image_v2(img_url)
+            st.write(f"Original Dimensions: {origin_image.width} x {origin_image.height}")
+            processed_image = origin_image
+        except Exception as e:
+            st.error(f"Failed to read image: {e}")
+            return
+    else:
+        st.info("Please enter an image URL/path or upload an image")
+        return
+
+    output = None
+    # Inference button
+    if start_button:
+        with st.spinner(f"Inferring... Server: {config['ip']}:{config['port']}"):
+            response = inference_with_vllm(
+                processed_image, prompt, config['ip'], config['port'],
+            )
+            output = {
+                'prompt': prompt,
+                'response': response,
+            }
+    else:
+        st.image(processed_image, width=500)
+
+    # Process results
+    if output:
+        process_and_display_results(output, processed_image, config)
+
+if __name__ == "__main__":
+    main()
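Launch it with the standard Streamlit entry point, e.g. `streamlit run demo/demo_streamlit.py`, with the vLLM server reachable at the IP/port configured in the sidebar (defaults to `127.0.0.1:8000`).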
demo/demo_vllm.py ADDED
@@ -0,0 +1,42 @@
+import argparse
+
+from PIL import Image
+from transformers.utils.versions import require_version
+
+from dots_ocr.utils import dict_promptmode_to_prompt
+from dots_ocr.model.inference import inference_with_vllm
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--ip", type=str, default="localhost")
+parser.add_argument("--port", type=str, default="8000")
+parser.add_argument("--model_name", type=str, default="model")
+parser.add_argument("--prompt_mode", type=str, default="prompt_layout_all_en")
+
+args = parser.parse_args()
+
+require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
+
+
+def main():
+    image_path = "demo/demo_image1.jpg"
+    prompt = dict_promptmode_to_prompt[args.prompt_mode]
+    image = Image.open(image_path)
+    response = inference_with_vllm(
+        image,
+        prompt,
+        ip=args.ip,
+        port=args.port,
+        temperature=0.1,
+        top_p=0.9,
+        model_name=args.model_name,
+    )
+    print(f"response: {response}")
+
+
+if __name__ == "__main__":
+    main()
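Run it against a live server with, e.g., `python demo/demo_vllm.py --ip localhost --port 8000 --prompt_mode prompt_layout_all_en`; note that `--model_name` must match the `--served-model-name` passed to vLLM (`model` by default in `demo/launch_model_vllm.sh`).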
demo/launch_model_vllm.sh ADDED
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# download the model to /path/to/model
+if [ -z "$NODOWNLOAD" ]; then
+    python3 tools/download_model.py
+fi
+
+# register the custom model code with vllm
+hf_model_path=./weights/DotsOCR  # Path to your downloaded model weights
+export PYTHONPATH=$(dirname "$hf_model_path"):$PYTHONPATH
+sed -i '/^from vllm\.entrypoints\.cli\.main import main$/a\
+from DotsOCR import modeling_dots_ocr_vllm' `which vllm`
+
+# launch the vllm server
+model_name=model
+CUDA_VISIBLE_DEVICES=0 vllm serve ${hf_model_path} --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --chat-template-content-format string --served-model-name ${model_name} --trust-remote-code
+
+# run the python demo once the vllm server is up:
+# python demo/demo_vllm.py
docker/Dockerfile ADDED
@@ -0,0 +1,4 @@
+FROM vllm/vllm-openai:v0.9.1
+
+RUN pip3 install flash_attn==2.8.0.post2
+RUN pip3 install transformers==4.51.3
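The compose file below expects this image to be tagged `dots-ocr:latest`; a plausible build command (path assumed from the repo layout) is `docker build -t dots-ocr:latest docker/`.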
docker/docker-compose.yml ADDED
@@ -0,0 +1,44 @@
+version: '3.8'
+
+services:
+  dots-ocr-server:
+    image: dots-ocr:latest
+    container_name: dots-ocr-container
+    ports:
+      - "8000:8000"
+    volumes:
+      # Download the model locally first; model URL: https://www.modelscope.cn/models/rednote-hilab/dots.ocr
+      - ./model/dots.ocr:/workspace/weights/DotsOCR
+    environment:
+      - PYTHONPATH=/workspace/weights:$PYTHONPATH
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              device_ids: ['0']
+    entrypoint: /bin/bash
+    command:
+      - -c
+      - |
+        set -ex;
+        echo '--- Starting setup and server ---';
+        echo 'Modifying vllm entrypoint...';
+        # This sed command patches the vllm entrypoint script to import the custom modeling code.
+        sed -i '/^from vllm\.entrypoints\.cli\.main import main/a from DotsOCR import modeling_dots_ocr_vllm' $(which vllm) && \
+        echo 'vllm script after patch:';
+        # Show the patched part of the vllm script for verification.
+        grep -A 1 'from vllm.entrypoints.cli.main import main' $(which vllm) && \
+        echo 'Starting server...';
+        # Use 'exec' to replace the current shell process with the vllm server,
+        # ensuring logs are properly forwarded to Docker's standard output.
+        exec vllm serve /workspace/weights/DotsOCR \
+          --tensor-parallel-size 1 \
+          --gpu-memory-utilization 0.8 \
+          --chat-template-content-format string \
+          --served-model-name dotsocr-model \
+          --trust-remote-code
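With the image built and the weights downloaded into `./model/dots.ocr`, `docker compose up -d` from the `docker/` directory should bring the server up on port 8000. Note the served model name here is `dotsocr-model`, not the `model` default used by the Python demos, so pass the matching `model_name` when calling this server.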
dots.ocr LICENSE AGREEMENT ADDED
@@ -0,0 +1,109 @@
+dots.ocr LICENSE AGREEMENT
+
+Effective Date: [August 8, 2025]
+
+Copyright Holder: [Xingyin Information Technology (Shanghai) Co., Ltd]
+
+This License Agreement (“Agreement”) governs Your use, reproduction, modification, and distribution of dots.ocr (the "Model Materials"). This Agreement is designed to maximize the openness and use of the Model Materials while addressing the unique legal, ethical, and technical challenges posed by large language models.
+
+WHEREAS, Licensor has developed the dots.ocr document parsing model and intends to distribute the Model Materials under an open‑source framework;
+WHEREAS, traditional open-source licenses (e.g., the MIT License) may not fully address the inherent complexities of document parsing models, namely their multiple components (code, weights, training data), potential ethical risks, data‑governance issues, and intellectual‑property and liability questions regarding AI‑generated content;
+WHEREAS, Licensor seeks to provide a legal framework that ensures maximum access to and use of the Model Materials while clearly defining the rights, obligations, and liabilities of Licensee;
+
+THEREFORE, the parties agree that, subject to the MIT License, they shall be bound by the following terms and conditions:
+
+1. Definitions and Interpretation
+Purpose: To define key terms used in this Agreement, particularly "Model Materials," ensuring clarity of the license scope beyond traditional software code. To clarify the order of precedence between this Agreement and the MIT License to avoid conflict.
+
+1.1 “Licensor” shall mean the entity providing the Model Materials under this Agreement, namely [Xingyin Information Technology (Shanghai) Co., Ltd].
+
+1.2 “Licensee” or "You" shall mean any individual or entity exercising permissions granted by this Agreement.
+
+1.3 “Model Materials” shall mean all materials provided by Licensor under this Agreement, including but not limited to:
+    (a) one or more machine‑learning models, including architecture and trained parameters (i.e., model weights);
+    (b) all associated preprocessing, training, inference, and fine‑tuning code;
+    (c) training datasets and evaluation scripts (or their detailed descriptions and access mechanisms); and
+    (d) any accompanying documentation, metadata, and tools.
+The above Model Materials shall be subject to the content published on the Licensor’s website or GitHub repository at https://github.com/rednote-hilab/dots.ocr.
+
+1.4 “Outputs” shall mean any content generated through the use of the Model Materials, such as text, tables, code, layout information, and formulas extracted from documents.
+
+1.5 “MIT License” shall mean The MIT Open Source License published by the Massachusetts Institute of Technology.
+
+1.6 Priority of Agreement. In the event of any conflict or inconsistency between this Agreement and the MIT License, the terms of the MIT License shall prevail. However, if the terms of the MIT License are ambiguous or silent on a particular matter, the provisions of this Agreement shall apply and supplement the MIT License.
+
+2. Grant of Rights and Scope of Use
+
+Purpose: To grant broad, permissive rights to the Licensee for the Model Materials—including code, weights, data, and documentation—to ensure maximum openness and flexibility while clarifying the free use of model-generated content. Additionally, it clarifies the feasibility of transitioning from open source to commercial use and the use of OpenAPI interfaces.
+
+2.1 Grant of Copyright License. Subject to Licensee's compliance with this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non‑exclusive, no-charge, royalty‑free copyright license to use (run or test), reproduce, modify, create derivative works of, merge, publish, distribute the Model Materials; sublicense and/or sell copies of the Model Materials or any derivative works thereof; and incorporate the unmodified or modified Model Materials into proprietary products or services, including for commercial purposes, software‑as‑a‑service (SaaS) offerings, or via OpenAPI or other interfaces.
+
+2.2 Fundamental Capabilities. The Model Materials only provide the fundamental model’s capabilities. Licensees may develop derivative AI applications or undertake task‑specific training thereon.
+
+2.3 From Open Source to Commercial Use. The open-source release does not preclude Licensor’s commercial exploitation of the Model Materials, in whole or in part. Any such commercial use shall, at that time, be subject to license agreements between Licensor and applicable users.
+
+2.4 API‑Service Exception. Licensees who access the Model Materials through API calls or provide model services via API interfaces (without directly distributing model weights) shall not be subject to this Agreement unless otherwise expressly agreed. Instead, such use shall be governed by the API terms of use published by Licensor (if any).
+
+3. Acceptable Use Policy and Prohibited Uses
+
+3.1 Responsible Use. Licensee must use the Model Materials in a responsible, ethical, and lawful manner, in compliance with all applicable laws, regulations, industry standards, and best practices.
+
+3.2 Enterprise On‑Premises Deployment. The Licensee may deploy the Model Materials in closed‑source, on‑premises enterprise environments.
+
+3.3 Prohibited Uses. Any breach of the prohibitions below will result in the automatic termination of all licenses granted under this Agreement. Licensee agrees not to use the Model Materials or any derivative works thereof, in connection with:
+(a) Identification and Utilization of Illegal/Harmful Content: Includes identifying graphic/text materials used for counterfeiting certificates/invoices, perpetrating fraud, or launching cyberattacks; or processing images containing illegal content such as violence, criminal activities, disinformation, or child exploitation.
+(b) Privacy Infringement and Discriminatory Practices: Extracting personal sensitive information (e.g., ID numbers, medical records, biometric data) or protected characteristics (e.g., race, gender) from images without legal authorization or consent, for purposes of privacy violation, automated discriminatory decision-making, or harassment.
+(c) Copyright Restrictions: Licensees shall not use the tool for unauthorized digitization of publications/document scanning or bulk scraping of content. Any use involving publications or other copyright-protected materials must first obtain relevant permissions.
+
+4. Intellectual Property Ownership and Contributions
+
+4.1 Licensor's Copyright Reservation. Licensor reserves all right, title, and interest in and to the Model Materials (including the model architecture, parameters, code, and original training data), except as expressly licensed herein. The original copyright of the Model Materials belongs to the Licensor.
+
+4.2 Patent License. Subject to the terms and conditions of this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model Materials, where such license applies only to those patent claims licensable by the Licensor that are necessarily infringed by its contribution(s).
+If Licensee institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model Materials constitute direct or contributory patent infringement, then any patent licenses granted under this License for the Model Materials shall terminate as of the date such litigation is asserted or filed.
+
+4.3 Outputs. The Outputs generated through the use of the Model Materials generally refer to text, tables, layouts, and other content extracted from documents or images. The extracted content itself does not generate new intellectual property rights, and all intellectual property remains with the original authors or copyright holders. The Licensee is responsible for due diligence regarding the legality of the Outputs, particularly where the content extracted by the OCR model may be substantially similar to existing copyrighted works, which could present intellectual property infringement risks. The Licensor assumes no liability for such infringements.
+4.4 Trademarks. Nothing in this License permits Licensee to make use of Licensor’s trademarks, trade names, logos (e.g., “rednote,” “Xiaohongshu,” “dots.ocr”) or to otherwise suggest endorsement or misrepresent the relationship between the parties, unless Licensor’s prior written approval is granted.
+
+5. Data Governance, Privacy, and Security
+
+5.1 Data Quality and Bias. Licensee shall use training data from lawful sources and is encouraged to conduct due diligence before deploying the Model Materials and to take reasonable steps to mitigate any known biases in its training data or applications.
+
+5.2 Privacy Protection.
+    (a) Sensitive‑Data Restrictions. It is prohibited to use the Model Materials to process, extract, or infer sensitive personal data protected under specific laws (such as GDPR or HIPAA), particularly when dealing with documents containing personally identifiable information (such as ID numbers, health data, financial information, etc.), unless Licensee has obtained all necessary consents, lawful basis, or authorizations, and has implemented adequate anonymization, pseudonymization, or other privacy-enhancing technologies.
+    (b) Data Minimization and Purpose Limitation. The Licensee shall follow the principle of data minimization when using the OCR Model, processing only the user data necessary for specific, explicit, and lawful purposes. Specifically, the OCR Model should avoid processing unnecessary sensitive data and ensure compliance with applicable privacy protection laws during data handling.
+    (c) Transparency. Licensee shall provide clear and transparent privacy policies and terms of use when processing user data, particularly during document scanning and information extraction.
+
+5.3 Security Measures. Licensee shall implement appropriate technical and administrative safeguards to protect the Model Materials and any associated data against unauthorized access, disclosure, alteration, or destruction. Such measures may include, but are not limited to, encryption, access controls, logging, and audit trails.
+
+5.4 Further Training. Licensee may only use user‑provided input or Outputs for training, fine-tuning, or improving other AI models if it has obtained the specific and informed consent of data subjects.
+
+6. Disclaimer of Warranty and Limitation of Liability
+
+6.1 “AS IS” Basis. Unless required by applicable law, the Model Materials are provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. Licensee is solely responsible for determining the appropriateness of using or redistributing the Model Materials and assumes any risks associated with the exercise of permissions under this License. Licensor does not provide any warranty of non-infringement but represents that no infringing code has been knowingly included.
+
+6.2 Outputs Disclaimer. As a neutral technology, Licensor disclaims all liability for the accuracy, completeness, reliability, safety, legality, or suitability of any Outputs. The Licensee is solely responsible for verifying the accuracy and appropriateness of AI-generated content and shall provide appropriate disclosures when publishing or relying upon such content.
+
+6.3 Limitation of Liability and Recourse. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall Licensor or contributors be liable for any claims or damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model Materials (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Licensor has been advised of the possibility of such damages. If such losses are incurred, recourse may be sought against the Licensee responsible for causing the loss.
+
+6.4 Content‑Filtering Disclaimer. Although the Model Materials may include content‑filtering mechanisms, Licensor makes no warranties of any kind regarding the stability, quality, accuracy, completeness, or any specific outcome of Outputs. Licensee is solely responsible for reviewing, verifying, and performing quality control on Outputs and assumes all associated risks and liabilities.
+
+7. Attribution and License Reservation
+
+7.1 License. When distributing or redistributing the Model Materials, Licensee must give any other recipients of the Model Materials a copy of this Agreement.
+
+7.2 Copyright and Notices. When distributing any part of the Model Materials, Licensee must retain all copyright, patent, trademark, and attribution notices included in the Model Materials.
+
+7.3 Attribution. Licensee is encouraged to prominently display the name of Licensor and the Model Materials in any public statements, products, or services that contain the Model Materials (or any derivative works thereof), to promote transparency and community trust. If Licensee distributes modified weights or fine‑tuned models based on the Model Materials, Licensee must prominently display the following statement in the related website or documentation: “Built with dots.ocr.”
+
+8. Governing Law and Dispute Resolution
+
+8.1 Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the People’s Republic of China, without regard to its conflict of laws principles.
+
+8.2 Dispute Resolution. Any dispute, claim, or disagreement arising out of or relating to this Agreement shall first be resolved through amicable consultation. If such consultation fails, the dispute shall be submitted to the Hangzhou Arbitration Commission for arbitration. The arbitration shall be conducted in accordance with the laws of China, and the place of arbitration shall be [Hangzhou, China]. The arbitral award shall be final and binding upon both parties.
+
+9. Regulatory Compliance Amendments
+In the event that any part of this Agreement becomes invalid or requires adjustment due to changes in applicable laws or regulations, Licensor reserves the right to issue a revised version of this Agreement. Licensee shall migrate to the new version within [e.g., ninety (90)] days of its release; otherwise, all rights granted under this Agreement shall automatically terminate.
+
+10. Security Reporting
+A Licensee who discovers any security vulnerability in the Model Materials may report it to Licensor via: dots-feedback@xiaohongshu.com. Licensee shall not disclose vulnerability details until Licensor issues an official remediation, unless otherwise required by law.
dots_ocr/__init__.py ADDED
@@ -0,0 +1 @@
+from .parser import DotsOCRParser
dots_ocr/model/inference.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import io
3
+ import base64
4
+ import math
5
+ from PIL import Image
6
+ import requests
7
+ from dots_ocr.utils.image_utils import PILimage_to_base64
8
+ from openai import OpenAI
9
+ import os
10
+
11
+
12
+ def inference_with_vllm(
13
+ image,
14
+ prompt,
15
+ ip="localhost",
16
+ port=8000,
17
+ temperature=0.1,
18
+ top_p=0.9,
19
+ max_completion_tokens=32768,
20
+ model_name='model',
21
+ ):
22
+
23
+ addr = f"http://{ip}:{port}/v1"
24
+ client = OpenAI(api_key="{}".format(os.environ.get("API_KEY", "0")), base_url=addr)
25
+ messages = []
26
+ messages.append(
27
+ {
28
+ "role": "user",
29
+ "content": [
30
+ {
31
+ "type": "image_url",
32
+ "image_url": {"url": PILimage_to_base64(image)},
33
+ },
34
+ {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"} # if no "<|img|><|imgpad|><|endofimg|>" here,vllm v1 will add "\n" here
35
+ ],
36
+ }
37
+ )
38
+ try:
39
+ response = client.chat.completions.create(
40
+ messages=messages,
41
+ model=model_name,
42
+ max_completion_tokens=max_completion_tokens,
43
+ temperature=temperature,
44
+ top_p=top_p)
45
+ response = response.choices[0].message.content
46
+ return response
47
+ except Exception as e:  # the openai>=1.0 client raises openai.* errors (httpx-based), not requests exceptions
48
+ print(f"request error: {e}")
49
+ return None
50
+
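A minimal usage sketch for `inference_with_vllm` (a sketch only, assuming a vLLM server is already serving the dots.ocr weights on localhost:8000 under the served name `model`, e.g. via demo/launch_model_vllm.sh; `demo/demo_image1.jpg` ships with this repo):

```python
# Sketch: assumes a running vLLM server at localhost:8000 serving the
# dots.ocr weights under the name "model" (see demo/launch_model_vllm.sh).
from PIL import Image

from dots_ocr.model.inference import inference_with_vllm
from dots_ocr.utils.prompts import dict_promptmode_to_prompt

image = Image.open("demo/demo_image1.jpg")
response = inference_with_vllm(
    image,
    dict_promptmode_to_prompt["prompt_layout_all_en"],
    ip="localhost",
    port=8000,
    model_name="model",
)
print(response)  # layout JSON as text, or None if the request failed
```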
dots_ocr/parser.py ADDED
@@ -0,0 +1,428 @@
1
+ import os
2
+ import json
3
+ from tqdm import tqdm
4
+ from multiprocessing.pool import ThreadPool, Pool
5
+ import argparse
6
+
7
+
8
+ from dots_ocr.model.inference import inference_with_vllm
9
+ from dots_ocr.utils.consts import image_extensions, MIN_PIXELS, MAX_PIXELS
10
+ from dots_ocr.utils.image_utils import get_image_by_fitz_doc, fetch_image, smart_resize
11
+ from dots_ocr.utils.doc_utils import fitz_doc_to_image, load_images_from_pdf
12
+ from dots_ocr.utils.prompts import dict_promptmode_to_prompt
13
+ from dots_ocr.utils.layout_utils import post_process_output, draw_layout_on_image, pre_process_bboxes
14
+ from dots_ocr.utils.format_transformer import layoutjson2md
15
+
16
+
17
+ class DotsOCRParser:
18
+ """
19
+ Parse an image or a PDF file with dots.ocr, via a vLLM server or a local HF model.
20
+ """
21
+
22
+ def __init__(self,
23
+ ip='localhost',
24
+ port=8000,
25
+ model_name='model',
26
+ temperature=0.1,
27
+ top_p=1.0,
28
+ max_completion_tokens=16384,
29
+ num_thread=64,
30
+ dpi=200,
31
+ output_dir="./output",
32
+ min_pixels=None,
33
+ max_pixels=None,
34
+ use_hf=False,
35
+ ):
36
+ self.dpi = dpi
37
+
38
+ # default args for vllm server
39
+ self.ip = ip
40
+ self.port = port
41
+ self.model_name = model_name
42
+ # default args for inference
43
+ self.temperature = temperature
44
+ self.top_p = top_p
45
+ self.max_completion_tokens = max_completion_tokens
46
+ self.num_thread = num_thread
47
+ self.output_dir = output_dir
48
+ self.min_pixels = min_pixels
49
+ self.max_pixels = max_pixels
50
+
51
+ self.use_hf = use_hf
52
+ if self.use_hf:
53
+ self._load_hf_model()
54
+ print(f"use hf model, num_thread will be set to 1")
55
+ else:
56
+ print(f"use vllm model, num_thread will be set to {self.num_thread}")
57
+ assert self.min_pixels is None or self.min_pixels >= MIN_PIXELS
58
+ assert self.max_pixels is None or self.max_pixels <= MAX_PIXELS
59
+
60
+ def _load_hf_model(self):
61
+ import torch
62
+ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
63
+ from qwen_vl_utils import process_vision_info
64
+
65
+ model_path = "./weights/DotsOCR"
66
+ self.model = AutoModelForCausalLM.from_pretrained(
67
+ model_path,
68
+ attn_implementation="flash_attention_2",
69
+ torch_dtype=torch.bfloat16,
70
+ device_map="auto",
71
+ trust_remote_code=True
72
+ )
73
+ self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, use_fast=True)
74
+ self.process_vision_info = process_vision_info
75
+
76
+ def _inference_with_hf(self, image, prompt):
77
+ messages = [
78
+ {
79
+ "role": "user",
80
+ "content": [
81
+ {
82
+ "type": "image",
83
+ "image": image
84
+ },
85
+ {"type": "text", "text": prompt}
86
+ ]
87
+ }
88
+ ]
89
+
90
+ # Preparation for inference
91
+ text = self.processor.apply_chat_template(
92
+ messages,
93
+ tokenize=False,
94
+ add_generation_prompt=True
95
+ )
96
+ image_inputs, video_inputs = self.process_vision_info(messages)
97
+ inputs = self.processor(
98
+ text=[text],
99
+ images=image_inputs,
100
+ videos=video_inputs,
101
+ padding=True,
102
+ return_tensors="pt",
103
+ )
104
+
105
+ inputs = inputs.to("cuda")
106
+
107
+ # Inference: Generation of the output
108
+ generated_ids = self.model.generate(**inputs, max_new_tokens=24000)
109
+ generated_ids_trimmed = [
110
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
111
+ ]
112
+ response = self.processor.batch_decode(
113
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
114
+ )[0]
115
+ return response
116
+
117
+ def _inference_with_vllm(self, image, prompt):
118
+ response = inference_with_vllm(
119
+ image,
120
+ prompt,
121
+ model_name=self.model_name,
122
+ ip=self.ip,
123
+ port=self.port,
124
+ temperature=self.temperature,
125
+ top_p=self.top_p,
126
+ max_completion_tokens=self.max_completion_tokens,
127
+ )
128
+ return response
129
+
130
+ def get_prompt(self, prompt_mode, bbox=None, origin_image=None, image=None, min_pixels=None, max_pixels=None):
131
+ prompt = dict_promptmode_to_prompt[prompt_mode]
132
+ if prompt_mode == 'prompt_grounding_ocr':
133
+ assert bbox is not None
134
+ bboxes = [bbox]
135
+ bbox = pre_process_bboxes(origin_image, bboxes, input_width=image.width, input_height=image.height, min_pixels=min_pixels, max_pixels=max_pixels)[0]
136
+ prompt = prompt + str(bbox)
137
+ return prompt
138
+
139
+ # def post_process_results(self, response, prompt_mode, save_dir, save_name, origin_image, image, min_pixels, max_pixels)
140
+ def _parse_single_image(
141
+ self,
142
+ origin_image,
143
+ prompt_mode,
144
+ save_dir,
145
+ save_name,
146
+ source="image",
147
+ page_idx=0,
148
+ bbox=None,
149
+ fitz_preprocess=False,
150
+ ):
151
+ min_pixels, max_pixels = self.min_pixels, self.max_pixels
152
+ if prompt_mode == "prompt_grounding_ocr":
153
+ min_pixels = min_pixels or MIN_PIXELS # preprocess image to the final input
154
+ max_pixels = max_pixels or MAX_PIXELS
155
+ if min_pixels is not None: assert min_pixels >= MIN_PIXELS, f"min_pixels should be >= {MIN_PIXELS}"
156
+ if max_pixels is not None: assert max_pixels <= MAX_PIXELS, f"max_pixels should be <= {MAX_PIXELS}"
157
+
158
+ if source == 'image' and fitz_preprocess:
159
+ image = get_image_by_fitz_doc(origin_image, target_dpi=self.dpi)
160
+ image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
161
+ else:
162
+ image = fetch_image(origin_image, min_pixels=min_pixels, max_pixels=max_pixels)
163
+ input_height, input_width = smart_resize(image.height, image.width)
164
+ prompt = self.get_prompt(prompt_mode, bbox, origin_image, image, min_pixels=min_pixels, max_pixels=max_pixels)
165
+ if self.use_hf:
166
+ response = self._inference_with_hf(image, prompt)
167
+ else:
168
+ response = self._inference_with_vllm(image, prompt)
169
+ result = {'page_no': page_idx,
170
+ "input_height": input_height,
171
+ "input_width": input_width
172
+ }
173
+ if source == 'pdf':
174
+ save_name = f"{save_name}_page_{page_idx}"
175
+ if prompt_mode in ['prompt_layout_all_en', 'prompt_layout_only_en', 'prompt_grounding_ocr']:
176
+ cells, filtered = post_process_output(
177
+ response,
178
+ prompt_mode,
179
+ origin_image,
180
+ image,
181
+ min_pixels=min_pixels,
182
+ max_pixels=max_pixels,
183
+ )
184
+ if filtered and prompt_mode != 'prompt_layout_only_en':  # the model's JSON output failed to parse; fall back to the filtered plain output
185
+ json_file_path = os.path.join(save_dir, f"{save_name}.json")
186
+ with open(json_file_path, 'w', encoding="utf-8") as w:
187
+ json.dump(response, w, ensure_ascii=False)
188
+
189
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
190
+ origin_image.save(image_layout_path)
191
+ result.update({
192
+ 'layout_info_path': json_file_path,
193
+ 'layout_image_path': image_layout_path,
194
+ })
195
+
196
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
197
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
198
+ md_file.write(cells)
199
+ result.update({
200
+ 'md_content_path': md_file_path
201
+ })
202
+ result.update({
203
+ 'filtered': True
204
+ })
205
+ else:
206
+ try:
207
+ image_with_layout = draw_layout_on_image(origin_image, cells)
208
+ except Exception as e:
209
+ print(f"Error drawing layout on image: {e}")
210
+ image_with_layout = origin_image
211
+
212
+ json_file_path = os.path.join(save_dir, f"{save_name}.json")
213
+ with open(json_file_path, 'w', encoding="utf-8") as w:
214
+ json.dump(cells, w, ensure_ascii=False)
215
+
216
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
217
+ image_with_layout.save(image_layout_path)
218
+ result.update({
219
+ 'layout_info_path': json_file_path,
220
+ 'layout_image_path': image_layout_path,
221
+ })
222
+ if prompt_mode != "prompt_layout_only_en": # no text md when detection only
223
+ md_content = layoutjson2md(origin_image, cells, text_key='text')
224
+ md_content_no_hf = layoutjson2md(origin_image, cells, text_key='text', no_page_hf=True)  # clean output without page headers/footers, used for omnidocbench / olmbench metrics
225
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
226
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
227
+ md_file.write(md_content)
228
+ md_nohf_file_path = os.path.join(save_dir, f"{save_name}_nohf.md")
229
+ with open(md_nohf_file_path, "w", encoding="utf-8") as md_file:
230
+ md_file.write(md_content_no_hf)
231
+ result.update({
232
+ 'md_content_path': md_file_path,
233
+ 'md_content_nohf_path': md_nohf_file_path,
234
+ })
235
+ else:
236
+ image_layout_path = os.path.join(save_dir, f"{save_name}.jpg")
237
+ origin_image.save(image_layout_path)
238
+ result.update({
239
+ 'layout_image_path': image_layout_path,
240
+ })
241
+
242
+ md_content = response
243
+ md_file_path = os.path.join(save_dir, f"{save_name}.md")
244
+ with open(md_file_path, "w", encoding="utf-8") as md_file:
245
+ md_file.write(md_content)
246
+ result.update({
247
+ 'md_content_path': md_file_path,
248
+ })
249
+
250
+ return result
251
+
252
+ def parse_image(self, input_path, filename, prompt_mode, save_dir, bbox=None, fitz_preprocess=False):
253
+ origin_image = fetch_image(input_path)
254
+ result = self._parse_single_image(origin_image, prompt_mode, save_dir, filename, source="image", bbox=bbox, fitz_preprocess=fitz_preprocess)
255
+ result['file_path'] = input_path
256
+ return [result]
257
+
258
+ def parse_pdf(self, input_path, filename, prompt_mode, save_dir):
259
+ print(f"loading pdf: {input_path}")
260
+ images_origin = load_images_from_pdf(input_path, dpi=self.dpi)
261
+ total_pages = len(images_origin)
262
+ tasks = [
263
+ {
264
+ "origin_image": image,
265
+ "prompt_mode": prompt_mode,
266
+ "save_dir": save_dir,
267
+ "save_name": filename,
268
+ "source":"pdf",
269
+ "page_idx": i,
270
+ } for i, image in enumerate(images_origin)
271
+ ]
272
+
273
+ def _execute_task(task_args):
274
+ return self._parse_single_image(**task_args)
275
+
276
+ if self.use_hf:
277
+ num_thread = 1
278
+ else:
279
+ num_thread = min(total_pages, self.num_thread)
280
+ print(f"Parsing PDF with {total_pages} pages using {num_thread} threads...")
281
+
282
+ results = []
283
+ with ThreadPool(num_thread) as pool:
284
+ with tqdm(total=total_pages, desc="Processing PDF pages") as pbar:
285
+ for result in pool.imap_unordered(_execute_task, tasks):
286
+ results.append(result)
287
+ pbar.update(1)
288
+
289
+ results.sort(key=lambda x: x["page_no"])
290
+ for i in range(len(results)):
291
+ results[i]['file_path'] = input_path
292
+ return results
293
+
294
+ def parse_file(self,
295
+ input_path,
296
+ output_dir="",
297
+ prompt_mode="prompt_layout_all_en",
298
+ bbox=None,
299
+ fitz_preprocess=False
300
+ ):
301
+ output_dir = output_dir or self.output_dir
302
+ output_dir = os.path.abspath(output_dir)
303
+ filename, file_ext = os.path.splitext(os.path.basename(input_path))
304
+ save_dir = os.path.join(output_dir, filename)
305
+ os.makedirs(save_dir, exist_ok=True)
306
+
307
+ if file_ext == '.pdf':
308
+ results = self.parse_pdf(input_path, filename, prompt_mode, save_dir)
309
+ elif file_ext in image_extensions:
310
+ results = self.parse_image(input_path, filename, prompt_mode, save_dir, bbox=bbox, fitz_preprocess=fitz_preprocess)
311
+ else:
312
+ raise ValueError(f"file extension {file_ext} not supported, supported extensions are {image_extensions} and pdf")
313
+
314
+ print(f"Parsing finished, results saving to {save_dir}")
315
+ with open(os.path.join(output_dir, os.path.basename(filename)+'.jsonl'), 'w', encoding="utf-8") as w:
316
+ for result in results:
317
+ w.write(json.dumps(result, ensure_ascii=False) + '\n')
318
+
319
+ return results
320
+
321
+
322
+
323
+ def main():
324
+ prompts = list(dict_promptmode_to_prompt.keys())
325
+ parser = argparse.ArgumentParser(
326
+ description="dots.ocr Multilingual Document Layout Parser",
327
+ )
328
+
329
+ parser.add_argument(
330
+ "input_path", type=str,
331
+ help="Input PDF/image file path"
332
+ )
333
+
334
+ parser.add_argument(
335
+ "--output", type=str, default="./output",
336
+ help="Output directory (default: ./output)"
337
+ )
338
+
339
+ parser.add_argument(
340
+ "--prompt", choices=prompts, type=str, default="prompt_layout_all_en",
341
+ help="prompt to query the model, different prompts for different tasks"
342
+ )
343
+ parser.add_argument(
344
+ '--bbox',
345
+ type=int,
346
+ nargs=4,
347
+ metavar=('x1', 'y1', 'x2', 'y2'),
348
+ help='bounding box (x1 y1 x2 y2); required when using --prompt prompt_grounding_ocr'
349
+ )
350
+ parser.add_argument(
351
+ "--ip", type=str, default="localhost",
352
+ help=""
353
+ )
354
+ parser.add_argument(
355
+ "--port", type=int, default=8000,
356
+ help=""
357
+ )
358
+ parser.add_argument(
359
+ "--model_name", type=str, default="model",
360
+ help=""
361
+ )
362
+ parser.add_argument(
363
+ "--temperature", type=float, default=0.1,
364
+ help=""
365
+ )
366
+ parser.add_argument(
367
+ "--top_p", type=float, default=1.0,
368
+ help=""
369
+ )
370
+ parser.add_argument(
371
+ "--dpi", type=int, default=200,
372
+ help=""
373
+ )
374
+ parser.add_argument(
375
+ "--max_completion_tokens", type=int, default=16384,
376
+ help=""
377
+ )
378
+ parser.add_argument(
379
+ "--num_thread", type=int, default=16,
380
+ help=""
381
+ )
382
+ parser.add_argument(
383
+ "--no_fitz_preprocess", action='store_true',
384
+ help="False will use tikz dpi upsample pipeline, good for images which has been render with low dpi, but maybe result in higher computational costs"
385
+ )
386
+ parser.add_argument(
387
+ "--min_pixels", type=int, default=None,
388
+ help=""
389
+ )
390
+ parser.add_argument(
391
+ "--max_pixels", type=int, default=None,
392
+ help=""
393
+ )
394
+ parser.add_argument(
395
+ "--use_hf", type=bool, default=False,
396
+ help=""
397
+ )
398
+ args = parser.parse_args()
399
+
400
+ dots_ocr_parser = DotsOCRParser(
401
+ ip=args.ip,
402
+ port=args.port,
403
+ model_name=args.model_name,
404
+ temperature=args.temperature,
405
+ top_p=args.top_p,
406
+ max_completion_tokens=args.max_completion_tokens,
407
+ num_thread=args.num_thread,
408
+ dpi=args.dpi,
409
+ output_dir=args.output,
410
+ min_pixels=args.min_pixels,
411
+ max_pixels=args.max_pixels,
412
+ use_hf=args.use_hf,
413
+ )
414
+
415
+ fitz_preprocess = not args.no_fitz_preprocess
416
+ if fitz_preprocess:
417
+ print(f"Using fitz preprocess for image input, check the change of the image pixels")
418
+ result = dots_ocr_parser.parse_file(
419
+ args.input_path,
420
+ prompt_mode=args.prompt,
421
+ bbox=args.bbox,
422
+ fitz_preprocess=fitz_preprocess,
423
+ )
424
+
425
+
426
+
427
+ if __name__ == "__main__":
428
+ main()
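A minimal end-to-end sketch for `DotsOCRParser` (same assumption of a running vLLM server as above; `demo/demo_pdf1.pdf` ships with this repo):

```python
# Sketch: assumes a vLLM server at localhost:8000 serving the dots.ocr weights.
from dots_ocr import DotsOCRParser

ocr_parser = DotsOCRParser(ip="localhost", port=8000, output_dir="./output")
results = ocr_parser.parse_file(
    "demo/demo_pdf1.pdf",
    prompt_mode="prompt_layout_all_en",
)
for page in results:  # one result dict per PDF page, sorted by page_no
    print(page["page_no"], page.get("md_content_path"))
```

From the repo root, something like `python3 -m dots_ocr.parser demo/demo_pdf1.pdf --output ./output` should be the equivalent CLI call.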
dots_ocr/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .prompts import dict_promptmode_to_prompt