Upload folder using huggingface_hub

- README.md +87 -96
- model.safetensors +3 -0
- tokenizer.json +2 -2
- tokenizer.model +2 -2

README.md CHANGED
@@ -40,26 +40,24 @@ PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM for Robust In-the-Wild Document
 [](https://x.com/PaddlePaddle)
 [](./LICENSE)
 
-**🔥
-**📝 [Technical Report](https://arxiv.org/pdf/2510.14528)
+**🔥 Official Website**: [Baidu AI Studio](https://aistudio.baidu.com/paddleocr) |
+**📝 arXiv**: [Technical Report](https://arxiv.org/pdf/2510.14528)
 
 </div>
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/allmetric.png" width="800"/>
 </div>
 
-
-
 ## Introduction
 
-**PaddleOCR-VL-1.5 is an upgraded model achieving a new SOTA accuracy of 94.5% on OmniDocBench v1.5**. To rigorously evaluate robustness against real-world physical distortions—including scanning artifacts,
+**PaddleOCR-VL-1.5 is an upgraded model achieving a new SOTA accuracy of 94.5% on OmniDocBench v1.5**. To rigorously evaluate robustness against real-world physical distortions—including scanning artifacts, skewing, curving, screen-photo capture, and light variations—we propose the Real5-OmniDocBench benchmark. Experimental results demonstrate that this enhanced model attains SOTA performance on the newly curated benchmark. Furthermore, we extend the model’s capabilities by incorporating seal recognition and text spotting tasks, while remaining a 0.9B ultra-compact VLM with high efficiency.
 
 ### **Key Capabilities of PaddleOCR-VL-1.5**
 
-1. With a **parameter size of 0.9B**, PaddleOCR-VL-1.5 **achieves
+1. With a **parameter size of 0.9B**, PaddleOCR-VL-1.5 **achieves 94.5% accuracy on OmniDocBench v1.5**, surpassing the previous SOTA model PaddleOCR-VL. Significant improvements are observed in **table, formula, and text understanding.**
 
-2. **It introduces an innovative approach to document parsing by supporting irregular-shaped localization**, enabling accurate polygonal detection under skewed and
+2. **It introduces an innovative approach to document parsing by supporting irregular-shaped localization**, enabling accurate polygonal detection under skewed and curved document conditions. Evaluations across five real-world scenarios—scanning, curving, skewing, screen-photo capture, and light variation—demonstrate superior performance over mainstream open-source and proprietary models.
 
 3. The model introduces **text spotting (text-line localization and recognition)**, along with **seal recognition**, with all corresponding metrics **setting new SOTA results** in their respective tasks.
 
@@ -71,13 +69,13 @@ PaddleOCR-VL-1.5: Towards a Multi-Task 0.9B VLM for Robust In-the-Wild Document
 ### **Model Architecture**
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/paddleocrvl.png" width="800"/>
 </div>
 
 
 ## News
 
-* ```2026.01.29``` 🚀 We release [PaddleOCR-VL-1.5](https://
+* ```2026.01.29``` 🚀 We release [PaddleOCR-VL-1.5](https://github.com/PaddlePaddle/PaddleOCR-1.5), a Multi-Task 0.9B VLM for Robust In-the-Wild Document Parsing.
 
 ## Usage
 
@@ -177,7 +175,7 @@ from transformers import AutoProcessor, AutoModelForImageTextToText
 # ---- Settings ----
 model_path = "PaddlePaddle/PaddleOCR-VL-1.5"
 image_path = "test.png"
-task = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula' | 'spotting'
+task = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula' | 'spotting'
 # ------------------
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -187,28 +185,11 @@ PROMPTS = {
     "formula": "Formula Recognition:",
     "chart": "Chart Recognition:",
     "spotting": "Spotting:",
-    "seal": "Seal Recognition:",
 }
 
-model = AutoModelForImageTextToText.from_pretrained(model_path,
+model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="bfloat16").to(DEVICE).eval()
 processor = AutoProcessor.from_pretrained(model_path)
-
-# ---- Image Preprocessing ----
 image = Image.open(image_path).convert("RGB")
-orig_w, orig_h = image.size
-spotting_upscale_threshold = 1500
-
-if task == "spotting" and orig_w < spotting_upscale_threshold and orig_h < spotting_upscale_threshold:
-    process_w, process_h = orig_w * 2, orig_h * 2
-    try:
-        resample_filter = Image.Resampling.LANCZOS
-    except AttributeError:
-        resample_filter = Image.LANCZOS
-    image = image.resize((process_w, process_h), resample_filter)
-
-# Set max_pixels: use 1605632 for spotting, otherwise use default ~1M pixels
-max_pixels = 2048 * 28 * 28 if task == "spotting" else 1280 * 28 * 28
-
 messages = [
     {
         "role": "user",
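The diff elides the body of the user message (old lines 215–217) between this hunk and the next, so the exact content list is not visible in this commit view. A purely hypothetical sketch of what such a message typically looks like with this processor family, pairing the loaded image with the prompt selected by `task`:

```python
# Hypothetical reconstruction; the elided message body is not shown in this diff.
prompt = PROMPTS[task]  # e.g. "Spotting:" when task == "spotting"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }
]
```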
@@ -218,20 +199,15 @@ messages = [
         ]
     }
 ]
-
 inputs = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt",
-    image_processor_kwargs={
-        "max_pixels": max_pixels,
-        "min_pixels": 144 * 28 * 28
-    }
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
 ).to(model.device)
 
-outputs = model.generate(**inputs, max_new_tokens=
+outputs = model.generate(**inputs, max_new_tokens=100)
 result = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:-1])
 print(result)
 ```
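This commit removes the spotting-specific image preprocessing and the `image_processor_kwargs` overrides from both usage examples. For readers who still want the old behavior, here is a minimal standalone sketch reconstructed from the deleted lines (names such as `image_path` and `task` come from the surrounding example):

```python
from PIL import Image

image = Image.open(image_path).convert("RGB")
orig_w, orig_h = image.size

# The removed code upscaled small inputs 2x for the spotting task
# (threshold: both sides shorter than 1500 px), using LANCZOS resampling.
if task == "spotting" and orig_w < 1500 and orig_h < 1500:
    try:
        resample_filter = Image.Resampling.LANCZOS  # Pillow >= 9.1
    except AttributeError:
        resample_filter = Image.LANCZOS  # older Pillow releases
    image = image.resize((orig_w * 2, orig_h * 2), resample_filter)

# 2048 * 28 * 28 = 1,605,632 pixels for spotting; 1280 * 28 * 28 ~= 1M otherwise.
max_pixels = 2048 * 28 * 28 if task == "spotting" else 1280 * 28 * 28
```

The deleted call then forwarded these bounds via `image_processor_kwargs={"max_pixels": max_pixels, "min_pixels": 144 * 28 * 28}` in `processor.apply_chat_template(...)`; whether that kwarg is still accepted depends on your installed transformers version.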
@@ -252,7 +228,7 @@ from transformers import AutoProcessor, AutoModelForImageTextToText
 # ---- Settings ----
 model_path = "PaddlePaddle/PaddleOCR-VL-1.5"
 image_path = "test.png"
-task = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula' | 'spotting'
+task = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula' | 'spotting'
 # ------------------
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -262,28 +238,11 @@ PROMPTS = {
     "formula": "Formula Recognition:",
     "chart": "Chart Recognition:",
     "spotting": "Spotting:",
-    "seal": "Seal Recognition:",
 }
 
 model = AutoModelForImageTextToText.from_pretrained(model_path, dtype="bfloat16", attn_implementation="flash_attention_2").to(DEVICE).eval()
 processor = AutoProcessor.from_pretrained(model_path)
-
-# ---- Image Preprocessing ----
 image = Image.open(image_path).convert("RGB")
-orig_w, orig_h = image.size
-spotting_upscale_threshold = 1500
-
-if task == "spotting" and orig_w < spotting_upscale_threshold and orig_h < spotting_upscale_threshold:
-    process_w, process_h = orig_w * 2, orig_h * 2
-    try:
-        resample_filter = Image.Resampling.LANCZOS
-    except AttributeError:
-        resample_filter = Image.LANCZOS
-    image = image.resize((process_w, process_h), resample_filter)
-
-# Set max_pixels: use 1605632 for spotting, otherwise use default ~1M pixels
-max_pixels = 2048 * 28 * 28 if task == "spotting" else 1280 * 28 * 28
-
 messages = [
     {
         "role": "user",
@@ -299,13 +258,9 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-    image_processor_kwargs={
-        "max_pixels": max_pixels,
-        "min_pixels": 144 * 28 * 28
-    }
 ).to(model.device)
 
-outputs = model.generate(**inputs, max_new_tokens=
+outputs = model.generate(**inputs, max_new_tokens=100)
 result = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:-1])
 print(result)
 ```
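The example above hard-codes `attn_implementation="flash_attention_2"`, which requires the `flash-attn` package and a supported NVIDIA GPU. A hedged sketch of a graceful fallback, assuming a transformers version that also accepts `attn_implementation="sdpa"` (names taken from the example):

```python
# Prefer FlashAttention-2 when the flash-attn package is importable,
# otherwise fall back to PyTorch's built-in scaled-dot-product attention.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "sdpa"

model = (
    AutoModelForImageTextToText.from_pretrained(
        model_path, dtype="bfloat16", attn_implementation=attn_impl
    )
    .to(DEVICE)
    .eval()
)
```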
@@ -314,109 +269,145 @@ print(result)
 
 ## Performance
 
-### Document Parsing
+### Page-Level Document Parsing
 
 
 #### 1. OmniDocBench v1.5
 
-##### PaddleOCR-VL
+##### PaddleOCR-VL achieves SOTA performance on the overall, text, formula, table, and reading-order metrics of OmniDocBench v1.5
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni15.png" width="800"/>
 </div>
 
 
-> **Notes:**
-> - Performance metrics are cited from the [OmniDocBench official leaderboard](https://opendatalab.com/omnidocbench), except for Gemini-3 Pro, Qwen3-VL-235B-A22B-Instruct and our model, which were evaluated independently.
-
 
-#### 2.
+#### 2. OmniDocBench v1.0
 
-#####
+##### PaddleOCR-VL achieves SOTA performance on almost all overall, text, formula, table, and reading-order metrics of OmniDocBench v1.0
 
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omni10.png" width="800"/>
 </div>
 
+
 > **Notes:**
-> -
+> - The metrics are from [MinerU](https://github.com/opendatalab/MinerU), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), and our own internal evaluations.
+
+
+### Element-level Recognition
+
+#### 1. Text
+
+**Comparison of OmniDocBench-OCR-block Performance**
+
+PaddleOCR-VL demonstrates robust and versatile capability in handling diverse document types, establishing it as the leading method in the OmniDocBench-OCR-block performance evaluation.
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/omnibenchocr.png" width="800"/>
 </div>
 
-> **Notes:**
-> - End-to-End Inference Performance Comparison on OmniDocBench v1.5. PDF documents were processed in batches of 512 on a single NVIDIA A100 GPU. The reported end-to-end runtime includes both PDF rendering and Markdown generation. All methods rely on their built-in PDF parsing modules and default DPI settings to reflect out-of-the-box performance.
 
+**Comparison of In-house-OCR Performance**
+
+In-house-OCR provides an evaluation of performance across multiple languages and text types. Our model demonstrates outstanding accuracy, with the lowest edit distances in all evaluated scripts.
+
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouseocr.png" width="800"/>
+</div>
+
+
+#### 2. Table
+
+**Comparison of In-house-Table Performance**
+
+Our self-built evaluation set contains diverse types of table images, such as Chinese, English, and mixed Chinese-English tables, with various characteristics like full, partial, or no borders, book/manual formats, lists, academic papers, merged cells, as well as low-quality and watermarked tables. PaddleOCR-VL achieves remarkable performance across all categories.
+
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousetable.png" width="600"/>
+</div>
+
+#### 3. Formula
+
+**Comparison of In-house-Formula Performance**
+
+The In-house-Formula evaluation set contains simple prints, complex prints, camera scans, and handwritten formulas. PaddleOCR-VL demonstrates the best performance in every category.
+
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhouse-formula.png" width="500"/>
+</div>
+
+
+#### 4. Chart
+
+**Comparison of In-house-Chart Performance**
+
+The evaluation set is broadly categorized into 11 chart categories, including bar-line hybrid, pie, 100% stacked bar, area, bar, bubble, histogram, line, scatterplot, stacked area, and stacked bar. PaddleOCR-VL not only outperforms expert OCR VLMs but also surpasses some 72B-level multimodal language models.
+
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/inhousechart.png" width="400"/>
+</div>
 
-## Visualization
 
+
+## Visualization
 
-###
+
+### Comprehensive Document Parsing
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview1.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview2.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview3.jpg" width="600"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/overview4.jpg" width="600"/>
 </div>
 
-
+
+### Text
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_english_arabic.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/text_handwriting_02.jpg" width="300" style="display: inline-block;"/>
 </div>
 
 
-####
+### Table
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_01.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/table_02.jpg" width="300" style="display: inline-block;"/>
 </div>
 
 
-
-<div align="center">
-<img src="https://
-</div>
-
-
-###
-
-<div align="center">
-<img src="https://
-</div>
-
-
-###
+### Formula
+
+<div align="center">
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_EN.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/formula_ZH.jpg" width="300" style="display: inline-block;"/>
+</div>
+
+
+### Chart
 
 <div align="center">
-<img src="https://
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_01.jpg" width="300" style="display: inline-block;"/>
+<img src="https://huggingface.co/datasets/PaddlePaddle/PaddleOCR-VL_demo/resolve/main/imgs/chart_02.jpg" width="300" style="display: inline-block;"/>
 </div>
 
 
 ## Acknowledgments
 
-We would like to thank [
+We would like to thank [ERNIE](https://github.com/PaddlePaddle/ERNIE), [Keye](https://github.com/Kwai-Keye/Keye), [MinerU](https://github.com/opendatalab/MinerU), and [OmniDocBench](https://github.com/opendatalab/OmniDocBench) for providing valuable code, model weights, and benchmarks. We also appreciate everyone's contribution to this open-source project!
 
 ## Citation
 
-If you find PaddleOCR-VL
+If you find PaddleOCR-VL helpful, feel free to give us a star and citation.
 
 ```bibtex
 coming soon
model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d557c9d8997ae57ed3b1b33bdf347be878cc335687f32ca105341c16973f8958
+size 1917255968
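The model.safetensors entry above and the two tokenizer entries below are Git LFS pointer files: the repository tracks only the spec version, a sha256 `oid`, and the blob `size` in bytes, while the actual bytes live in LFS storage. A small sketch, assuming the real `model.safetensors` has been downloaded locally, for checking a file against the pointer's `oid`:

```python
import hashlib

def sha256_of(path: str) -> str:
    """Stream the file in 1 MiB chunks so multi-GB weights need not fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the pointer file added in this commit.
expected = "d557c9d8997ae57ed3b1b33bdf347be878cc335687f32ca105341c16973f8958"
assert sha256_of("model.safetensors") == expected
```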
tokenizer.json CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:45ed88f7769781f2251cbfc8d7f162dd458eddf23ab99051a9db4448c09b5c33
+size 133
tokenizer.model CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3614d0fee7e3b9a9a00b978752c9cf87bc85984a38ab08f4d7dd6bb8d2e3be83
+size 132