Spaces:

lsextractor
/

deepseek-ocr2-api

Running

App Files Files Community

lsextractor committed on Feb 5

Commit

74c8e47

verified ·

1 Parent(s): 1504305

Deploy deepseek-ai/DeepSeek-OCR-2

Browse files

Files changed (5) hide show

.env +3 -0
README.md +105 -5
app.py +191 -0
requirements.txt +24 -0
runtime.txt +1 -0

.env ADDED Viewed

	@@ -0,0 +1,3 @@

+MODEL_NAME=deepseek-ai/DeepSeek-OCR-2
+MODEL_DTYPE=float16
+MAX_IMAGE_SIZE=2048

README.md CHANGED Viewed

@@ -1,12 +1,112 @@
 ---
-title: Deepseek Ocr2 Api
-emoji: 😻
-colorFrom: gray
 colorTo: purple
 sdk: gradio
-sdk_version: 6.5.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: DeepSeek OCR-2 API
+emoji: 🔍
+colorFrom: blue
 colorTo: purple
 sdk: gradio
+sdk_version: 4.31.0
+python_version: 3.11
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
+# DeepSeek-OCR-2 Table Structure Recognition API
+High-accuracy OCR and table structure recognition using DeepSeek-OCR-2 (3B parameters).
+## Features
+- 📊 **Table Detection & Recognition**: Extract complex table structures
+- 📦 **Cell-Level Bounding Boxes**: Precise coordinates for all cells
+- 📋 **Header Detection**: Automatic header identification
+- 🔗 **Merged Cells**: Rowspan/colspan support
+- 🎯 **High Accuracy**: State-of-the-art performance
+## API Usage
+### Python Client
+```python
+import requests
+import base64
+# Load and encode image
+with open("document.png", "rb") as f:
+    image_b64 = base64.b64encode(f.read()).decode()
+# Call API
+response = requests.post(
+    "https://your-username-space-name.hf.space/api/ocr",
+    json={"data": [image_b64]},
+    headers={"Authorization": f"Bearer {YOUR_HF_TOKEN}"}
+)
+result = response.json()
+print(result)
+```
+### cURL
+```bash
+curl -X POST https://your-username-space-name.hf.space/api/ocr \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer YOUR_HF_TOKEN" \
+  -d '{"data": ["base64_encoded_image"]}'
+```
+## Response Format
+```json
+{
+  "status": "success",
+  "tables": [
+    {
+      "bbox": [x1, y1, x2, y2],
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "rowSpan": 1,
+          "colSpan": 1,
+          "bbox": [x1, y1, x2, y2],
+          "text": "Cell content"
+        }
+      ],
+      "headers": [...],
+      "rows": [...]
+    }
+  ],
+  "blocks": [...],
+  "text": "Extracted text...",
+  "metadata": {
+    "model": "deepseek-ai/DeepSeek-OCR-2",
+    "device": "cuda",
+    "image_size": [width, height]
+  }
+}
+```
+## Model Info
+- **Model:** deepseek-ai/DeepSeek-OCR-2
+- **Parameters:** 3B
+- **Precision:** FP16
+- **GPU:** T4 (16GB VRAM)
+- **License:** Apache-2.0
+## Links
+- [Model on HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2)
+- [Project Repository](https://git.epam.com/epm-gpt/badgerdoc/ls-extractor)
+- [Documentation](https://git.epam.com/epm-gpt/badgerdoc/ls-extractor/-/tree/main/docs)
+## Citation
+```bibtex
+@article{deepseek-ocr-2,
+  title={DeepSeek-OCR-2: Advanced Document Understanding},
+  author={DeepSeek AI},
+  year={2026}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+#!/usr/bin/env python3
+"""
+HuggingFace Spaces Gradio App for DeepSeek-OCR-2
+Uses lazy loading to avoid startup timeout on free CPU tier.
+"""
+import os
+import sys
+import traceback
+import time
+import threading
+import gradio as gr
+import torch
+from PIL import Image
+# Configuration
+MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR-2")
+MODEL_DTYPE = os.getenv("MODEL_DTYPE", "float16")
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE_MAP = {
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    "float32": torch.float32,
+}
+TORCH_DTYPE = DTYPE_MAP.get(MODEL_DTYPE, torch.float16)
+# Global state for lazy loading
+_model = None
+_processor = None
+_model_lock = threading.Lock()
+_loading = False
+_load_error = None
+print(f"🚀 App starting (lazy model loading)")
+print(f"📍 Device: {DEVICE}")
+print(f"🔢 Dtype: {MODEL_DTYPE}")
+print(f"📦 Model: {MODEL_NAME}")
+def get_model():
+    """Lazy load the model on first request."""
+    global _model, _processor, _loading, _load_error
+    if _model is not None:
+        return _model, _processor
+    with _model_lock:
+        # Double-check after acquiring lock
+        if _model is not None:
+            return _model, _processor
+        if _load_error:
+            raise RuntimeError(f"Model failed to load: {_load_error}")
+        _loading = True
+        print(f"⏳ Loading model: {MODEL_NAME}...")
+        try:
+            from transformers import AutoModel, AutoProcessor
+            model = AutoModel.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=TORCH_DTYPE,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                token=HF_TOKEN,
+            )
+            processor = AutoProcessor.from_pretrained(
+                MODEL_NAME,
+                trust_remote_code=True,
+                token=HF_TOKEN,
+            )
+            model = model.to(DEVICE).eval()
+            _model = model
+            _processor = processor
+            _loading = False
+            print(f"✅ Model loaded successfully on {DEVICE}")
+            return _model, _processor
+        except Exception as e:
+            _load_error = str(e)
+            _loading = False
+            print(f"❌ Failed to load model: {e}", file=sys.stderr)
+            traceback.print_exc()
+            raise
+def run_ocr(image):
+    """Process image and return OCR results as text."""
+    if image is None:
+        return "Error: No image provided"
+    try:
+        model, processor = get_model()
+    except Exception as e:
+        return f"Error loading model: {str(e)}\n\nThis may be due to memory constraints on free CPU tier.\nConsider upgrading to GPU hardware."
+    try:
+        # Preprocess
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        w, h = image.size
+        if max(w, h) > MAX_IMAGE_SIZE:
+            scale = MAX_IMAGE_SIZE / max(w, h)
+            image = image.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)
+        start = time.time()
+        # Run inference
+        if hasattr(model, 'chat'):
+            response = model.chat(
+                processor,
+                image,
+                "Extract all text from this image.",
+                history=[],
+            )
+            text = response if isinstance(response, str) else str(response)
+        else:
+            inputs = processor(images=image, return_tensors="pt")
+            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = model(**inputs)
+            if hasattr(outputs, 'logits'):
+                ids = outputs.logits.argmax(-1)
+                text = processor.batch_decode(ids, skip_special_tokens=True)[0]
+            else:
+                text = str(outputs)
+        elapsed = time.time() - start
+        # Build result
+        result = f"=== OCR Result ===\n\n{text}\n\n"
+        result += f"--- Metadata ---\n"
+        result += f"Model: {MODEL_NAME}\n"
+        result += f"Device: {DEVICE}\n"
+        result += f"Time: {elapsed:.2f}s\n"
+        return result
+    except Exception as e:
+        return f"Error: {str(e)}\n\n{traceback.format_exc()}"
+def get_status():
+    """Return current status without loading model."""
+    lines = [
+        "=== DeepSeek-OCR-2 Status ===",
+        "",
+        f"Model: {MODEL_NAME}",
+        f"Device: {DEVICE}",
+        f"Dtype: {MODEL_DTYPE}",
+        f"CUDA Available: {torch.cuda.is_available()}",
+        "",
+        f"Model Loaded: {'Yes' if _model is not None else 'No (loads on first request)'}",
+    ]
+    if _loading:
+        lines.append("Currently loading model...")
+    if _load_error:
+        lines.append(f"Error: {_load_error}")
+    lines.extend([
+        "",
+        "Note: Model loads on first OCR request to avoid startup timeout.",
+        "First request may take 1-2 minutes on CPU.",
+    ])
+    return "\n".join(lines)
+# Simple Gradio Interface using gr.Interface to avoid schema bugs
+demo = gr.Interface(
+    fn=run_ocr,
+    inputs=gr.Image(type="pil", label="Upload Image"),
+    outputs=gr.Textbox(label="OCR Result", lines=20),
+    title="DeepSeek-OCR-2",
+    description=f"Upload an image to extract text. Model: {MODEL_NAME} | Device: {DEVICE}\n\nNote: First request loads the model (~1-2 min on CPU).",
+    allow_flagging="never",
+    api_name="ocr"
+)
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+# HuggingFace Spaces Requirements
+# For DeepSeek-OCR-2 deployment
+# Note: Requires Python 3.11 (see README.md front matter)
+# Core dependencies
+torch==2.6.0
+transformers==4.46.3
+gradio==4.31.0
+# Model dependencies
+pillow>=8.0,<11.0
+safetensors>=0.7.0
+huggingface-hub>=0.19.0,<0.25.0
+tokenizers>=0.20.3
+accelerate>=1.12.0
+# OCR model specific
+einops>=0.7.0
+timm>=0.9.0
+addict>=2.4.0
+easydict>=1.13.0
+# Utilities
+numpy>=1.24.0

runtime.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-3.11