ta4tsering commited on
Commit
0ea2759
·
1 Parent(s): e95e9f9

feat: implement dots.ocr API and Gradio interface

Browse files
Files changed (3) hide show
  1. README.md +94 -1
  2. app.py +200 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -10,4 +10,97 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  license: apache-2.0
11
  ---
12
 
13
+ # Bec Dot.ocr API
14
+
15
+ OCR API powered by [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) -- a multilingual document-parsing vision-language model. This Space provides both a browser UI and a programmatic API optimized for batch processing.
16
+
17
+ ## Quick start
18
+
19
+ ### 1. Install the client
20
+
21
+ ```bash
22
+ pip install gradio_client
23
+ ```
24
+
25
+ ### 2. Process a single image
26
+
27
+ ```python
28
+ from gradio_client import Client, handle_file
29
+
30
+ client = Client("openpecha/bec-dot.orc-api")
31
+
32
+ result = client.predict(
33
+     handle_file("path/to/image.png"),  # local filepath or URL
34
+     "Extract the text content from this image.",  # prompt
35
+     api_name="/predict",
36
+ )
37
+ print(result)
38
+ ```
39
+
40
+ ### 3. Batch-process many images
41
+
42
+ ```python
43
+ import os
44
+ import json
45
+ from pathlib import Path
46
+ from gradio_client import Client, handle_file
47
+
48
+ client = Client("openpecha/bec-dot.orc-api")
49
+
50
+ image_dir = Path("images")
51
+ output_dir = Path("results")
52
+ output_dir.mkdir(exist_ok=True)
53
+
54
+ prompt = "Extract the text content from this image."
55
+
56
+ for img_path in sorted(image_dir.glob("*.png")):
57
+ print(f"Processing {img_path.name} ...")
58
+ result = client.predict(
59
+ handle_file(str(img_path)),
60
+ prompt,
61
+ api_name="/predict",
62
+ )
63
+ out_file = output_dir / f"{img_path.stem}.txt"
64
+ out_file.write_text(result, encoding="utf-8")
65
+ print(f" -> saved to {out_file}")
66
+ ```
67
+
68
+ > **Tip:** The Space uses queuing (`max_size=20`), so requests are queued and
69
+ > processed in order, which helps large batches complete reliably without overloading the Space.
70
+
71
+ ### 4. Use a custom prompt
72
+
73
+ The default prompt is `"Extract the text content from this image."` You can
74
+ override it for more specific tasks:
75
+
76
+ ```python
77
+ # Layout-aware JSON extraction
78
+ result = client.predict(
79
+ handle_file("document.png"),
80
+ """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
81
+
82
+ 1. Bbox format: [x1, y1, x2, y2]
83
+ 2. Layout Categories: ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
84
+ 3. Text Extraction & Formatting Rules:
85
+ - Picture: omit the text field.
86
+ - Formula: format as LaTeX.
87
+ - Table: format as HTML.
88
+ - All Others: format as Markdown.
89
+ 4. Output the original text with no translation.
90
+ 5. Sort all layout elements in human reading order.
91
+ 6. Final Output: a single JSON object.""",
92
+ api_name="/predict",
93
+ )
94
+ ```
95
+
96
+ ## API reference
97
+
98
+ | Endpoint | Method | Parameters | Returns |
99
+ |---|---|---|---|
100
+ | `/predict` | POST | `image` (filepath/URL), `prompt` (string) | Raw text or JSON string |
101
+
102
+ ## Model details
103
+
104
+ - **Model:** [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) (1.7B LLM, ~3B total)
105
+ - **Precision:** bfloat16
106
+ - **Capabilities:** text extraction, layout detection, table recognition (HTML), formula parsing (LaTeX), multilingual support
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import gradio as gr
5
+ from PIL import Image
6
+ from huggingface_hub import snapshot_download
7
+ from transformers import AutoModelForCausalLM, AutoProcessor
8
+ from qwen_vl_utils import process_vision_info
9
+
10
# Hugging Face Hub repo id of the dots.ocr vision-language model.
MODEL_ID = "rednote-hilab/dots.ocr"
# Local directory the model snapshot is downloaded into (beside this file).
MODEL_DIR = os.path.join(os.path.dirname(__file__), "model_weights")

# Instruction used when the caller supplies no (or a blank) prompt.
DEFAULT_PROMPT = "Extract the text content from this image."
14
+
15
+
16
def patch_configuration_dots(model_path: str) -> None:
    """Patch configuration_dots.py to fix the video_processor TypeError.

    Recent transformers versions require DotsVLProcessor to explicitly
    declare `attributes` and accept `video_processor=None`.
    See: https://huggingface.co/rednote-hilab/dots.ocr/discussions/38

    Args:
        model_path: Directory of the downloaded model snapshot that
            contains ``configuration_dots.py``.

    The patch is idempotent: a second call (e.g. after a Space restart
    with cached weights) leaves the file unchanged.
    """
    config_path = os.path.join(model_path, "configuration_dots.py")
    if not os.path.exists(config_path):
        return  # nothing to patch -- repo layout may have changed upstream

    # Explicit encoding: remote-code files are UTF-8; relying on the
    # platform default encoding could corrupt the file on some systems.
    with open(config_path, "r", encoding="utf-8") as f:
        source = f.read()

    if 'attributes = ["image_processor", "tokenizer"]' in source:
        return  # already patched

    old_snippet = (
        "class DotsVLProcessor(Qwen2_5_VLProcessor):\n"
        "    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):"
    )
    new_snippet = (
        "class DotsVLProcessor(Qwen2_5_VLProcessor):\n"
        '    attributes = ["image_processor", "tokenizer"]\n'
        "    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):"
    )
    patched = source.replace(old_snippet, new_snippet)

    # Only rewrite when the expected snippet was actually found; an upstream
    # refactor should not trigger a pointless (and misleading) file write.
    if patched != source:
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(patched)
44
+
45
+
46
def load_model():
    """Fetch the dots.ocr snapshot, apply the remote-code patch, and load it.

    Returns:
        A ``(model, processor)`` pair ready for inference.
    """
    print(f"Downloading {MODEL_ID} ...")
    snapshot_path = snapshot_download(
        repo_id=MODEL_ID,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False,
    )

    # Fix the processor class before transformers imports the remote code.
    patch_configuration_dots(snapshot_path)
    sys.path.insert(0, snapshot_path)

    # Prefer flash-attention kernels when the optional package is installed;
    # otherwise fall back to PyTorch's built-in SDPA implementation.
    try:
        import flash_attn  # noqa: F401
        attn_impl = "flash_attention_2"
    except ImportError:
        attn_impl = "sdpa"

    print(f"Loading model with attn_implementation={attn_impl} ...")
    model = AutoModelForCausalLM.from_pretrained(
        snapshot_path,
        attn_implementation=attn_impl,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(
        snapshot_path, trust_remote_code=True
    )

    return model, processor
79
+
80
+
81
# Load once at import time so every request reuses the same weights; this
# runs the snapshot download + patch before the Gradio app starts serving.
MODEL, PROCESSOR = load_model()
# NOTE(review): device_map="auto" already placed the model's weights; this
# string is only used in predict() to move the input tensors.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
83
+
84
+
85
def predict(image: Image.Image, prompt: str = DEFAULT_PROMPT) -> str:
    """Run dots.ocr on one image and return its raw output.

    Args:
        image: PIL Image to process.
        prompt: Instruction for the model; a blank value falls back to
            DEFAULT_PROMPT.

    Returns:
        The decoded model output (plain text or a JSON string, depending
        on the prompt).
    """
    # Guard clauses: no image is an error; a blank prompt gets the default.
    if image is None:
        return "Error: no image provided."
    if not (prompt and prompt.strip()):
        prompt = DEFAULT_PROMPT

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image.convert("RGB")},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template and gather the vision inputs separately,
    # as the Qwen2.5-VL style processor expects.
    rendered = PROCESSOR.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(conversation)

    model_inputs = PROCESSOR(
        text=[rendered],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        generated = MODEL.generate(**model_inputs, max_new_tokens=24000)

    # Drop the echoed prompt tokens so only the newly generated tail remains.
    completions = [
        seq[len(ctx):]
        for ctx, seq in zip(model_inputs.input_ids, generated)
    ]

    decoded = PROCESSOR.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0] if decoded else ""
141
+
142
+
143
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# NOTE: component declaration order inside the Blocks context determines the
# rendered layout, so the statements below are order-sensitive.
with gr.Blocks(title="dots.ocr API") as demo:
    # Intro plus a copy-pasteable client snippet shown at the top of the page.
    gr.Markdown(
        """
# dots.ocr -- OCR API

Upload an image and get the extracted text. This Space is optimized for
**programmatic API access** so you can batch-process hundreds of images from
an external script.

### Calling the API from Python

```python
from gradio_client import Client

client = Client("openpecha/bec-dot.orc-api")
result = client.predict(
    "path/to/image.png",  # image filepath
    "Extract the text content from this image.",  # prompt
    api_name="/predict",
)
print(result)
```
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: image upload and an editable instruction prompt.
            img_input = gr.Image(type="pil", label="Upload Image")
            prompt_input = gr.Textbox(
                value=DEFAULT_PROMPT,
                label="Prompt",
                lines=2,
            )
            run_btn = gr.Button("Run OCR", variant="primary")

        with gr.Column(scale=1):
            # Output: raw model text/JSON with a copy-to-clipboard button.
            output_text = gr.Textbox(
                label="Model Output",
                lines=20,
                show_copy_button=True,
            )

    # api_name="predict" exposes this handler at /predict for gradio_client.
    run_btn.click(
        fn=predict,
        inputs=[img_input, prompt_input],
        outputs=output_text,
        api_name="predict",
    )

# Queue requests (max 20 waiting) so long OCR jobs are processed in order.
demo.queue(max_size=20).launch(
    server_name="0.0.0.0",
    server_port=7860,
    show_error=True,
)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.57.0
2
+ torch>=2.4.0
3
+ torchvision>=0.19.0
4
+ Pillow>=10.0.0
5
+ accelerate>=1.0.0
6
+ einops>=0.8.0
7
+ qwen-vl-utils>=0.0.8
8
+ huggingface_hub>=0.25.0