ta4tsering commited on
Commit
0ea2759
·
1 Parent(s): e95e9f9

feat: implement dots.ocr API and Gradio interface

Browse files
Files changed (3) hide show
  1. README.md +94 -1
  2. app.py +200 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -10,4 +10,97 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  license: apache-2.0
11
  ---
12
 
13
+ # Bec Dot.ocr API
14
+
15
+ OCR API powered by [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) -- a multilingual document-parsing vision-language model. This Space provides both a browser UI and a programmatic API optimized for batch processing.
16
+
17
+ ## Quick start
18
+
19
+ ### 1. Install the client
20
+
21
+ ```bash
22
+ pip install gradio_client
23
+ ```
24
+
25
+ ### 2. Process a single image
26
+
27
+ ```python
28
+ from gradio_client import Client, handle_file
29
+
30
+ client = Client("openpecha/bec-dot.orc-api")
31
+
32
+ result = client.predict(
33
+     handle_file("path/to/image.png"),  # local filepath or URL
34
+     "Extract the text content from this image.",  # prompt
35
+     api_name="/predict",
36
+ )
37
+ print(result)
38
+ ```
39
+
40
+ ### 3. Batch-process many images
41
+
42
+ ```python
43
+ import os
44
+ import json
45
+ from pathlib import Path
46
+ from gradio_client import Client, handle_file
47
+
48
+ client = Client("openpecha/bec-dot.orc-api")
49
+
50
+ image_dir = Path("images")
51
+ output_dir = Path("results")
52
+ output_dir.mkdir(exist_ok=True)
53
+
54
+ prompt = "Extract the text content from this image."
55
+
56
+ for img_path in sorted(image_dir.glob("*.png")):
57
+ print(f"Processing {img_path.name} ...")
58
+ result = client.predict(
59
+ handle_file(str(img_path)),
60
+ prompt,
61
+ api_name="/predict",
62
+ )
63
+ out_file = output_dir / f"{img_path.stem}.txt"
64
+ out_file.write_text(result, encoding="utf-8")
65
+ print(f" -> saved to {out_file}")
66
+ ```
67
+
68
+ > **Tip:** The Space uses queuing (`max_size=20`), so requests are queued and
69
+ > processed in order, which helps large batches complete reliably without overloading the Space.
70
+
71
+ ### 4. Use a custom prompt
72
+
73
+ The default prompt is `"Extract the text content from this image."` You can
74
+ override it for more specific tasks:
75
+
76
+ ```python
77
+ # Layout-aware JSON extraction
78
+ result = client.predict(
79
+ handle_file("document.png"),
80
+ """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
81
+
82
+ 1. Bbox format: [x1, y1, x2, y2]
83
+ 2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
84
+ 3. Text Extraction & Formatting Rules:
85
+ - Picture: omit the text field.
86
+ - Formula: format as LaTeX.
87
+ - Table: format as HTML.
88
+ - All Others: format as Markdown.
89
+ 4. Output the original text with no translation.
90
+ 5. Sort all layout elements in human reading order.
91
+ 6. Final Output: a single JSON object.""",
92
+ api_name="/predict",
93
+ )
94
+ ```
95
+
96
+ ## API reference
97
+
98
+ | Endpoint | Method | Parameters | Returns |
99
+ |---|---|---|---|
100
+ | `/predict` | POST | `image` (filepath/URL), `prompt` (string) | Raw text or JSON string |
101
+
102
+ ## Model details
103
+
104
+ - **Model:** [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) (1.7B LLM, ~3B total)
105
+ - **Precision:** bfloat16
106
+ - **Capabilities:** text extraction, layout detection, table recognition (HTML), formula parsing (LaTeX), multilingual support
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import gradio as gr
5
+ from PIL import Image
6
+ from huggingface_hub import snapshot_download
7
+ from transformers import AutoModelForCausalLM, AutoProcessor
8
+ from qwen_vl_utils import process_vision_info
9
+
10
# Hugging Face Hub repo id of the dots.ocr vision-language model.
MODEL_ID = "rednote-hilab/dots.ocr"
# Local directory the model snapshot is downloaded into (beside this file).
MODEL_DIR = os.path.join(os.path.dirname(__file__), "model_weights")

# Instruction used when the caller supplies no (or a blank) prompt.
DEFAULT_PROMPT = "Extract the text content from this image."
14
+
15
+
16
def patch_configuration_dots(model_path: str) -> None:
    """Patch configuration_dots.py to fix the video_processor TypeError.

    Recent transformers versions require DotsVLProcessor to explicitly
    declare `attributes` and accept `video_processor=None`.
    See: https://huggingface.co/rednote-hilab/dots.ocr/discussions/38

    Args:
        model_path: Directory of the downloaded model snapshot that
            contains ``configuration_dots.py``.

    The patch is idempotent: a second call (e.g. after a Space restart
    with cached weights) leaves the file unchanged.
    """
    config_path = os.path.join(model_path, "configuration_dots.py")
    if not os.path.exists(config_path):
        return  # nothing to patch -- repo layout may have changed upstream

    # Explicit encoding: remote-code files are UTF-8; relying on the
    # platform default encoding could corrupt the file on some systems.
    with open(config_path, "r", encoding="utf-8") as f:
        source = f.read()

    if 'attributes = ["image_processor", "tokenizer"]' in source:
        return  # already patched

    old_snippet = (
        "class DotsVLProcessor(Qwen2_5_VLProcessor):\n"
        "    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):"
    )
    new_snippet = (
        "class DotsVLProcessor(Qwen2_5_VLProcessor):\n"
        '    attributes = ["image_processor", "tokenizer"]\n'
        "    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):"
    )
    patched = source.replace(old_snippet, new_snippet)

    # Only rewrite when the expected snippet was actually found; an upstream
    # refactor should not trigger a pointless (and misleading) file write.
    if patched != source:
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(patched)
44
+
45
+
46
def load_model():
    """Fetch the dots.ocr snapshot, apply the remote-code patch, and load it.

    Returns:
        A ``(model, processor)`` pair ready for inference.
    """
    print(f"Downloading {MODEL_ID} ...")
    snapshot_path = snapshot_download(
        repo_id=MODEL_ID,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False,
    )

    # Fix the processor class before transformers imports the remote code.
    patch_configuration_dots(snapshot_path)
    sys.path.insert(0, snapshot_path)

    # Prefer flash-attention kernels when the optional package is installed;
    # otherwise fall back to PyTorch's built-in SDPA implementation.
    try:
        import flash_attn  # noqa: F401
        attn_impl = "flash_attention_2"
    except ImportError:
        attn_impl = "sdpa"

    print(f"Loading model with attn_implementation={attn_impl} ...")
    model = AutoModelForCausalLM.from_pretrained(
        snapshot_path,
        attn_implementation=attn_impl,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(
        snapshot_path, trust_remote_code=True
    )

    return model, processor
79
+
80
+
81
# Load once at import time so every request reuses the same weights; this
# runs the snapshot download + patch before the Gradio app starts serving.
MODEL, PROCESSOR = load_model()
# NOTE(review): device_map="auto" already placed the model's weights; this
# string is only used in predict() to move the input tensors.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
83
+
84
+
85
def predict(image: Image.Image, prompt: str = DEFAULT_PROMPT) -> str:
    """Run dots.ocr on one image and return its raw output.

    Args:
        image: PIL Image to process.
        prompt: Instruction for the model; a blank value falls back to
            DEFAULT_PROMPT.

    Returns:
        The decoded model output (plain text or a JSON string, depending
        on the prompt).
    """
    # Guard clauses: no image is an error; a blank prompt gets the default.
    if image is None:
        return "Error: no image provided."
    if not (prompt and prompt.strip()):
        prompt = DEFAULT_PROMPT

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image.convert("RGB")},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template and gather the vision inputs separately,
    # as the Qwen2.5-VL style processor expects.
    rendered = PROCESSOR.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(conversation)

    model_inputs = PROCESSOR(
        text=[rendered],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        generated = MODEL.generate(**model_inputs, max_new_tokens=24000)

    # Drop the echoed prompt tokens so only the newly generated tail remains.
    completions = [
        seq[len(ctx):]
        for ctx, seq in zip(model_inputs.input_ids, generated)
    ]

    decoded = PROCESSOR.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0] if decoded else ""
141
+
142
+
143
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# NOTE: component declaration order inside the Blocks context determines the
# rendered layout, so the statements below are order-sensitive.
with gr.Blocks(title="dots.ocr API") as demo:
    # Intro plus a copy-pasteable client snippet shown at the top of the page.
    gr.Markdown(
        """
# dots.ocr -- OCR API

Upload an image and get the extracted text. This Space is optimized for
**programmatic API access** so you can batch-process hundreds of images from
an external script.

### Calling the API from Python

```python
from gradio_client import Client

client = Client("openpecha/bec-dot.orc-api")
result = client.predict(
    "path/to/image.png",  # image filepath
    "Extract the text content from this image.",  # prompt
    api_name="/predict",
)
print(result)
```
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: image upload and an editable instruction prompt.
            img_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                value=DEFAULT_PROMPT,
                label="Prompt",
                lines=2,
            )
            run_btn = gr.Button("Run OCR", variant="primary")

        with gr.Column(scale=1):
            # Output: raw model text/JSON with a copy-to-clipboard button.
            output_text = gr.Textbox(
                label="Model Output",
                lines=20,
                show_copy_button=True,
            )

    # api_name="predict" exposes this handler at /predict for gradio_client.
    run_btn.click(
        fn=predict,
        inputs=[img_input, prompt_input],
        outputs=output_text,
        api_name="predict",
    )

# Queue requests (max 20 waiting) so long OCR jobs are processed in order.
demo.queue(max_size=20).launch(
    server_name="0.0.0.0",
    server_port=7860,
    show_error=True,
)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.57.0
2
+ torch>=2.4.0
3
+ torchvision>=0.19.0
4
+ Pillow>=10.0.0
5
+ accelerate>=1.0.0
6
+ einops>=0.8.0
7
+ qwen-vl-utils>=0.0.8
8
+ huggingface_hub>=0.25.0