lsextractor committed on
Commit
74c8e47
·
verified ·
1 Parent(s): 1504305

Deploy deepseek-ai/DeepSeek-OCR-2

Browse files
Files changed (5) hide show
  1. .env +3 -0
  2. README.md +105 -5
  3. app.py +191 -0
  4. requirements.txt +24 -0
  5. runtime.txt +1 -0
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ MODEL_NAME=deepseek-ai/DeepSeek-OCR-2
2
+ MODEL_DTYPE=float16
3
+ MAX_IMAGE_SIZE=2048
README.md CHANGED
@@ -1,12 +1,112 @@
1
  ---
2
- title: Deepseek Ocr2 Api
3
- emoji: 😻
4
- colorFrom: gray
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
 
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: DeepSeek OCR-2 API
3
+ emoji: 🔍
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.31.0
8
+ python_version: 3.11
9
  app_file: app.py
10
  pinned: false
11
+ license: apache-2.0
12
  ---
13
 
14
+ # DeepSeek-OCR-2 Table Structure Recognition API
15
+
16
+ High-accuracy OCR and table structure recognition using DeepSeek-OCR-2 (3B parameters).
17
+
18
+ ## Features
19
+
20
+ - 📊 **Table Detection & Recognition**: Extract complex table structures
21
+ - 📦 **Cell-Level Bounding Boxes**: Precise coordinates for all cells
22
+ - 📋 **Header Detection**: Automatic header identification
23
+ - 🔗 **Merged Cells**: Rowspan/colspan support
24
+ - 🎯 **High Accuracy**: State-of-the-art performance
25
+
26
+ ## API Usage
27
+
28
+ ### Python Client
29
+
30
+ ```python
31
+ import requests
32
+ import base64
33
+
34
+ # Load and encode image
35
+ with open("document.png", "rb") as f:
36
+ image_b64 = base64.b64encode(f.read()).decode()
37
+
38
+ # Call API
39
+ response = requests.post(
40
+ "https://your-username-space-name.hf.space/api/predict",
41
+ json={"data": [image_b64]},
42
+ headers={"Authorization": f"Bearer {YOUR_HF_TOKEN}"}
43
+ )
44
+
45
+ result = response.json()
46
+ print(result)
47
+ ```
48
+
49
+ ### cURL
50
+
51
+ ```bash
52
+ curl -X POST https://your-username-space-name.hf.space/api/predict \
53
+ -H "Content-Type: application/json" \
54
+ -H "Authorization: Bearer YOUR_HF_TOKEN" \
55
+ -d '{"data": ["base64_encoded_image"]}'
56
+ ```
57
+
58
+ ## Response Format
59
+
60
+ ```json
61
+ {
62
+ "status": "success",
63
+ "tables": [
64
+ {
65
+ "bbox": [x1, y1, x2, y2],
66
+ "cells": [
67
+ {
68
+ "row": 0,
69
+ "col": 0,
70
+ "rowSpan": 1,
71
+ "colSpan": 1,
72
+ "bbox": [x1, y1, x2, y2],
73
+ "text": "Cell content"
74
+ }
75
+ ],
76
+ "headers": [...],
77
+ "rows": [...]
78
+ }
79
+ ],
80
+ "blocks": [...],
81
+ "text": "Extracted text...",
82
+ "metadata": {
83
+ "model": "deepseek-ai/DeepSeek-OCR-2",
84
+ "device": "cuda",
85
+ "image_size": [width, height]
86
+ }
87
+ }
88
+ ```
89
+
90
+ ## Model Info
91
+
92
+ - **Model:** deepseek-ai/DeepSeek-OCR-2
93
+ - **Parameters:** 3B
94
+ - **Precision:** FP16
95
+ - **GPU:** T4 (16GB VRAM)
96
+ - **License:** Apache-2.0
97
+
98
+ ## Links
99
+
100
+ - [Model on HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2)
101
+ - [Project Repository](https://git.epam.com/epm-gpt/badgerdoc/ls-extractor)
102
+ - [Documentation](https://git.epam.com/epm-gpt/badgerdoc/ls-extractor/-/tree/main/docs)
103
+
104
+ ## Citation
105
+
106
+ ```bibtex
107
+ @article{deepseek-ocr-2,
108
+ title={DeepSeek-OCR-2: Advanced Document Understanding},
109
+ author={DeepSeek AI},
110
+ year={2026}
111
+ }
112
+ ```
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HuggingFace Spaces Gradio App for DeepSeek-OCR-2
4
+
5
+ Uses lazy loading to avoid startup timeout on free CPU tier.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import traceback
11
+ import time
12
+ import threading
13
+
14
+ import gradio as gr
15
+ import torch
16
+ from PIL import Image
17
+
18
+ # Configuration
19
+ MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR-2")
20
+ MODEL_DTYPE = os.getenv("MODEL_DTYPE", "float16")
21
+ MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
22
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
23
+
24
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
+
26
+ DTYPE_MAP = {
27
+ "float16": torch.float16,
28
+ "bfloat16": torch.bfloat16,
29
+ "float32": torch.float32,
30
+ }
31
+ TORCH_DTYPE = DTYPE_MAP.get(MODEL_DTYPE, torch.float16)
32
+
33
+ # Global state for lazy loading
34
+ _model = None
35
+ _processor = None
36
+ _model_lock = threading.Lock()
37
+ _loading = False
38
+ _load_error = None
39
+
40
+ print(f"🚀 App starting (lazy model loading)")
41
+ print(f"📍 Device: {DEVICE}")
42
+ print(f"🔢 Dtype: {MODEL_DTYPE}")
43
+ print(f"📦 Model: {MODEL_NAME}")
44
+
45
+
def get_model():
    """Lazy-load the model and processor on the first request.

    Returns the cached ``(model, processor)`` pair, loading them on the
    first call.  Loading is deferred so the Space can start within the
    platform startup timeout on the free CPU tier.

    Raises:
        RuntimeError: if a previous load attempt already failed (the
            error string is cached so we fail fast instead of retrying).
        Exception: whatever ``from_pretrained`` raises on the first
            failing attempt.
    """
    global _model, _processor, _loading, _load_error

    # Fast path: already loaded — skip the lock entirely.
    if _model is not None:
        return _model, _processor

    with _model_lock:
        # Double-check after acquiring lock: another thread may have
        # finished loading while this one was waiting.
        if _model is not None:
            return _model, _processor

        # A previous attempt failed; surface the cached error instead of
        # re-downloading a model that will fail again.
        if _load_error:
            raise RuntimeError(f"Model failed to load: {_load_error}")

        _loading = True
        print(f"⏳ Loading model: {MODEL_NAME}...")

        try:
            # Imported lazily so the heavy transformers import does not
            # slow down app startup.
            from transformers import AutoModel, AutoProcessor

            # trust_remote_code is required: this model ships custom
            # modeling code on the Hub.
            model = AutoModel.from_pretrained(
                MODEL_NAME,
                torch_dtype=TORCH_DTYPE,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                token=HF_TOKEN,
            )
            processor = AutoProcessor.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                token=HF_TOKEN,
            )
            model = model.to(DEVICE).eval()

            # Publish to the module globals only once fully initialized.
            _model = model
            _processor = processor
            _loading = False
            print(f"✅ Model loaded successfully on {DEVICE}")
            return _model, _processor

        except Exception as e:
            # Cache the failure so subsequent calls raise immediately.
            _load_error = str(e)
            _loading = False
            print(f"❌ Failed to load model: {e}", file=sys.stderr)
            traceback.print_exc()
            raise
def run_ocr(image):
    """Run OCR on an uploaded image and return the result as display text.

    Args:
        image: A ``PIL.Image.Image``, or ``None`` when the Gradio client
            submitted nothing.

    Returns:
        A human-readable string: the extracted text plus a metadata
        footer, or an error description.  Errors are returned as text
        (not raised) so Gradio shows them in the output box.
    """
    if image is None:
        return "Error: No image provided"

    try:
        model, processor = get_model()
    except Exception as e:
        return f"Error loading model: {str(e)}\n\nThis may be due to memory constraints on free CPU tier.\nConsider upgrading to GPU hardware."

    try:
        # Preprocess: the model expects 3-channel RGB input.
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Downscale oversized images to bound memory use and latency.
        w, h = image.size
        if max(w, h) > MAX_IMAGE_SIZE:
            scale = MAX_IMAGE_SIZE / max(w, h)
            image = image.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)

        start = time.time()

        # Run inference.  Remote-code models of this family expose a
        # high-level `chat` API; otherwise fall back to a generic forward
        # pass (best-effort — the greedy argmax decode below is only
        # meaningful for models that emit token logits).
        if hasattr(model, 'chat'):
            response = model.chat(
                processor,
                image,
                "Extract all text from this image.",
                history=[],
            )
            text = response if isinstance(response, str) else str(response)
        else:
            inputs = processor(images=image, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            if hasattr(outputs, 'logits'):
                ids = outputs.logits.argmax(-1)
                text = processor.batch_decode(ids, skip_special_tokens=True)[0]
            else:
                text = str(outputs)

        elapsed = time.time() - start

        # Build the display result.  The "--- Metadata ---" line had no
        # placeholders, so it is a plain string (was a pointless f-string).
        result = f"=== OCR Result ===\n\n{text}\n\n"
        result += "--- Metadata ---\n"
        result += f"Model: {MODEL_NAME}\n"
        result += f"Device: {DEVICE}\n"
        result += f"Time: {elapsed:.2f}s\n"

        return result

    except Exception as e:
        # Deliberate broad catch: surface the full traceback in the UI
        # rather than crashing the request handler.
        return f"Error: {str(e)}\n\n{traceback.format_exc()}"
def get_status():
    """Report current app/model status as plain text without triggering a load."""
    loaded_msg = "Yes" if _model is not None else "No (loads on first request)"

    status_lines = [
        "=== DeepSeek-OCR-2 Status ===",
        "",
        f"Model: {MODEL_NAME}",
        f"Device: {DEVICE}",
        f"Dtype: {MODEL_DTYPE}",
        f"CUDA Available: {torch.cuda.is_available()}",
        "",
        f"Model Loaded: {loaded_msg}",
    ]

    # Transient load state, shown only when relevant.
    if _loading:
        status_lines.append("Currently loading model...")
    if _load_error:
        status_lines.append(f"Error: {_load_error}")

    status_lines.append("")
    status_lines.append("Note: Model loads on first OCR request to avoid startup timeout.")
    status_lines.append("First request may take 1-2 minutes on CPU.")

    return "\n".join(status_lines)
# Simple Gradio Interface using gr.Interface to avoid schema bugs.
# NOTE(review): the f-string description is evaluated once at import
# time, so DEVICE/MODEL_NAME shown in the UI reflect startup values.
demo = gr.Interface(
    fn=run_ocr,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="OCR Result", lines=20),
    title="DeepSeek-OCR-2",
    description=f"Upload an image to extract text. Model: {MODEL_NAME} | Device: {DEVICE}\n\nNote: First request loads the model (~1-2 min on CPU).",
    allow_flagging="never",
    api_name="ocr"
)

# Bind to all interfaces on port 7860 — the port HuggingFace Spaces
# expects the app to listen on.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Spaces Requirements
2
+ # For DeepSeek-OCR-2 deployment
3
+ # Note: Requires Python 3.11 (see README.md front matter)
4
+
5
+ # Core dependencies
6
+ torch==2.6.0
7
+ transformers==4.46.3
8
+ gradio==4.31.0
9
+
10
+ # Model dependencies
11
+ pillow>=8.0,<11.0
12
+ safetensors>=0.7.0
13
+ huggingface-hub>=0.19.0,<0.25.0
14
+ tokenizers>=0.20.3
15
+ accelerate>=1.12.0
16
+
17
+ # OCR model specific
18
+ einops>=0.7.0
19
+ timm>=0.9.0
20
+ addict>=2.4.0
21
+ easydict>=1.13.0
22
+
23
+ # Utilities
24
+ numpy>=1.24.0
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.11