File size: 7,328 Bytes
74c8e47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dee38d2
74c8e47
 
 
 
dee38d2
 
74c8e47
 
 
 
dee38d2
 
 
74c8e47
dee38d2
74c8e47
dee38d2
 
74c8e47
 
dee38d2
 
 
 
 
 
74c8e47
 
 
 
 
 
 
 
dee38d2
 
74c8e47
dee38d2
74c8e47
dee38d2
74c8e47
 
 
 
 
 
 
 
dee38d2
74c8e47
dee38d2
74c8e47
dee38d2
 
 
 
 
 
 
 
74c8e47
 
dee38d2
74c8e47
 
 
 
 
dee38d2
 
74c8e47
 
 
dee38d2
74c8e47
 
dee38d2
74c8e47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dee38d2
74c8e47
 
 
 
 
 
 
 
 
 
 
dee38d2
 
 
74c8e47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cf60f4
74c8e47
 
 
 
 
 
 
0cf60f4
74c8e47
 
 
0cf60f4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3
"""
HuggingFace Spaces Gradio App for DeepSeek-OCR-2

Uses lazy loading to avoid startup timeout on free CPU tier.
"""

import os
import sys
import traceback
import time
import threading

import gradio as gr
import torch
from PIL import Image

# Configuration
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR-2")
MODEL_DTYPE = os.getenv("MODEL_DTYPE", "float16")
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
HF_TOKEN = os.getenv("HF_TOKEN", None)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

DTYPE_MAP = {
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float32": torch.float32,
}
TORCH_DTYPE = DTYPE_MAP.get(MODEL_DTYPE, torch.float16)

# Global state for lazy loading
_model = None
_processor = None
_model_lock = threading.Lock()
_loading = False
_load_error = None

print(f"πŸš€ App starting (lazy model loading)")
print(f"πŸ“ Device: {DEVICE}")
print(f"πŸ”’ Dtype: {MODEL_DTYPE}")
print(f"πŸ“¦ Model: {MODEL_NAME}")


def get_model():
    """Lazily load and cache the OCR model and processor.

    Returns:
        tuple: ``(model, processor)``. Subsequent calls return the cached
        pair without reloading.

    Raises:
        RuntimeError: if loading fails now, or if a previous attempt
        already failed (its message is replayed so callers do not retry
        an expensive load that is known to be broken).
    """
    global _model, _processor, _loading, _load_error

    # Fast path: already loaded — reading an already-published reference
    # is safe without the lock under CPython.
    if _model is not None:
        return _model, _processor

    with _model_lock:
        # Double-checked locking: another thread may have finished the
        # load while we were waiting for the lock.
        if _model is not None:
            return _model, _processor

        if _load_error:
            raise RuntimeError(f"Model failed to load: {_load_error}")

        _loading = True
        print(f"⏳ Loading model: {MODEL_NAME}...")
        print(f"   Memory info: {torch.cuda.memory_allocated() if torch.cuda.is_available() else 'CPU mode'}")

        try:
            # Imported lazily so the app process starts fast on Spaces.
            from transformers import AutoModel, AutoProcessor

            print("📦 Loading processor...")
            processor = AutoProcessor.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            print("🧠 Loading model...")
            model = AutoModel.from_pretrained(
                MODEL_NAME,
                torch_dtype=TORCH_DTYPE,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                # device_map="auto" already places weights on the GPU(s).
                device_map="auto" if torch.cuda.is_available() else None,
                token=HF_TOKEN,
            )

            print(f"📍 Moving model to {DEVICE}...")
            if not torch.cuda.is_available():
                # Only needed on CPU; with device_map="auto" the model is
                # already on the right device(s).
                model = model.to(DEVICE)

            model = model.eval()

            _model = model
            _processor = processor
            print(f"✅ Model loaded successfully on {DEVICE}")
            return _model, _processor

        except Exception as e:
            error_msg = f"{type(e).__name__}: {str(e)}"
            _load_error = error_msg
            print(f"❌ Failed to load model: {error_msg}", file=sys.stderr)
            traceback.print_exc()
            # Chain the original exception so the full cause survives
            # (the original code discarded it with a bare raise).
            raise RuntimeError(error_msg) from e
        finally:
            # Clear the in-progress flag on every exit path, not just the
            # two explicit branches the original handled.
            _loading = False


def run_ocr(image):
    """Run OCR on an uploaded image and return the result as plain text.

    Args:
        image: PIL image from the Gradio widget, or None.

    Returns:
        str: Extracted text plus a metadata footer, or a human-readable
        error message — this function never raises; all failures are
        reported in the returned text.
    """
    if image is None:
        return "Error: No image provided"

    # Phase 1: obtain the lazily-loaded model; report failures as text.
    try:
        print("🔄 OCR request received, loading model...")
        model, processor = get_model()
        print("✅ Model loaded, processing image...")
    except Exception as e:
        error_msg = f"Error loading model: {str(e)}"
        print(f"❌ {error_msg}")

        reason = str(e).lower()
        # Heuristic: memory/CUDA failures get hardware-specific advice.
        if "memory" in reason or "cuda" in reason:
            return f"{error_msg}\n\n💡 This appears to be a memory issue. The DeepSeek-OCR-2 model (3B parameters) may be too large for the free CPU tier.\n\nSolutions:\n- Upgrade to GPU hardware (t4-small)\n- Try with smaller images\n- Use the local Docker version instead"
        return f"{error_msg}\n\nThis may be due to:\n- Network issues downloading the model\n- Temporary HuggingFace Hub issues\n- Hardware limitations\n\nTry again in a few moments or use the local Docker version."

    # Phase 2: preprocess and run inference; report failures as text.
    try:
        print("🖼️ Preprocessing image...")
        if image.mode != "RGB":
            image = image.convert("RGB")

        width, height = image.size
        print(f"📏 Original size: {width}x{height}")

        # Downscale so the longest edge fits within MAX_IMAGE_SIZE.
        longest = max(width, height)
        if longest > MAX_IMAGE_SIZE:
            ratio = MAX_IMAGE_SIZE / longest
            image = image.resize((int(width * ratio), int(height * ratio)), Image.Resampling.LANCZOS)
            print(f"📐 Resized to: {image.size}")

        t0 = time.time()
        print("🚀 Running inference...")

        if hasattr(model, 'chat'):
            # Preferred path: the model exposes a chat-style OCR API.
            response = model.chat(
                processor,
                image,
                "Extract all text from this image.",
                history=[],
            )
            text = response if isinstance(response, str) else str(response)
        else:
            # Fallback: generic forward pass with greedy logits decode.
            inputs = processor(images=image, return_tensors="pt")
            inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            if hasattr(outputs, 'logits'):
                ids = outputs.logits.argmax(-1)
                text = processor.batch_decode(ids, skip_special_tokens=True)[0]
            else:
                text = str(outputs)

        duration = time.time() - t0
        print(f"✅ Inference completed in {duration:.2f}s")

        # Assemble the result with a metadata footer.
        return (
            f"=== OCR Result ===\n\n{text}\n\n"
            f"--- Metadata ---\n"
            f"Model: {MODEL_NAME}\n"
            f"Device: {DEVICE}\n"
            f"Time: {duration:.2f}s\n"
        )

    except Exception as e:
        error_msg = f"Error during inference: {str(e)}"
        print(f"❌ {error_msg}")
        return f"{error_msg}\n\n{traceback.format_exc()}"


def get_status():
    """Build a plain-text status report without triggering a model load."""
    loaded_note = 'Yes' if _model is not None else 'No (loads on first request)'

    report = [
        "=== DeepSeek-OCR-2 Status ===",
        "",
        f"Model: {MODEL_NAME}",
        f"Device: {DEVICE}",
        f"Dtype: {MODEL_DTYPE}",
        f"CUDA Available: {torch.cuda.is_available()}",
        "",
        f"Model Loaded: {loaded_note}",
    ]

    # Surface transient/terminal load states when present.
    if _loading:
        report.append("Currently loading model...")
    if _load_error:
        report.append(f"Error: {_load_error}")

    report += [
        "",
        "Note: Model loads on first OCR request to avoid startup timeout.",
        "First request may take 1-2 minutes on CPU.",
    ]

    return "\n".join(report)


# Simple Gradio Interface - disable API docs to avoid schema bugs
# run_ocr handles all errors internally and returns them as text, so the
# UI never shows a raw traceback dialog.
demo = gr.Interface(
    fn=run_ocr,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="OCR Result", lines=20),
    title="DeepSeek-OCR-2",
    description=f"Upload an image to extract text. Model: {MODEL_NAME} | Device: {DEVICE}\n\nNote: First request loads the model (~1-2 min on CPU).",
    allow_flagging="never",
    api_name=False  # Disable API to avoid schema generation bug
)

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind for HuggingFace Spaces containers;
    # show_api=False matches the api_name=False choice above.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)