#!/usr/bin/env python3
"""
HuggingFace Spaces Gradio App for DeepSeek-OCR-2
Uses lazy loading to avoid startup timeout on free CPU tier.
"""
import os
import sys
import traceback
import time
import threading
import gradio as gr
import torch
from PIL import Image
# Configuration (all overridable via environment variables)
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR-2")   # HF Hub model id
MODEL_DTYPE = os.getenv("MODEL_DTYPE", "float16")                    # float16 | bfloat16 | float32
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))            # max edge length in px before downscale
HF_TOKEN = os.getenv("HF_TOKEN", None)                               # optional token for gated/private models
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Map the env-var string to an actual torch dtype; unknown values fall back to float16.
DTYPE_MAP = {
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float32": torch.float32,
}
TORCH_DTYPE = DTYPE_MAP.get(MODEL_DTYPE, torch.float16)

# Global state for lazy loading: the model is only loaded on the first OCR
# request so app startup stays within the Spaces health-check timeout.
_model = None
_processor = None
_model_lock = threading.Lock()  # guards the one-time load (double-checked locking)
_loading = False                # True while a load attempt is in progress
_load_error = None              # error message of a failed load attempt, if any

# NOTE: original log prefixes were mojibaked emoji; replaced with plain ASCII.
print("App starting (lazy model loading)")
print(f"Device: {DEVICE}")
print(f"Dtype: {MODEL_DTYPE}")
print(f"Model: {MODEL_NAME}")
def get_model():
    """Lazily load and cache the model/processor on the first request.

    Thread-safe via double-checked locking on ``_model_lock``. Subsequent
    calls return the cached pair immediately.

    Returns:
        tuple: (model, processor)

    Raises:
        RuntimeError: if loading fails now, or if a previous attempt
            already failed (the cached error is re-raised without retrying).
    """
    global _model, _processor, _loading, _load_error
    # Fast path: already loaded, no lock needed.
    if _model is not None:
        return _model, _processor
    with _model_lock:
        # Double-check after acquiring lock: another thread may have
        # finished loading while we were waiting.
        if _model is not None:
            return _model, _processor
        if _load_error:
            # Don't retry a known-bad load; surface the original failure.
            raise RuntimeError(f"Model failed to load: {_load_error}")
        _loading = True
        print(f"Loading model: {MODEL_NAME}...")
        print(f"  Memory info: {torch.cuda.memory_allocated() if torch.cuda.is_available() else 'CPU mode'}")
        try:
            # Imported here (not at module top) so app startup stays fast.
            from transformers import AutoModel, AutoProcessor

            print("Loading processor...")
            processor = AutoProcessor.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                token=HF_TOKEN,
            )
            print("Loading model...")
            model = AutoModel.from_pretrained(
                MODEL_NAME,
                torch_dtype=TORCH_DTYPE,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                device_map="auto" if torch.cuda.is_available() else None,
                token=HF_TOKEN,
            )
            print(f"Moving model to {DEVICE}...")
            # device_map="auto" already places the model on GPU; only move
            # explicitly in the CPU case.
            if not torch.cuda.is_available():
                model = model.to(DEVICE)
            model = model.eval()
            _model = model
            _processor = processor
            _loading = False
            # NOTE: this success message was split across two lines by a
            # mojibaked emoji in the original; restored to one statement.
            print(f"Model loaded successfully on {DEVICE}")
            return _model, _processor
        except Exception as e:
            error_msg = f"{type(e).__name__}: {str(e)}"
            _load_error = error_msg
            _loading = False
            print(f"Failed to load model: {error_msg}", file=sys.stderr)
            traceback.print_exc()
            # Chain the original exception so logs retain full context.
            raise RuntimeError(error_msg) from e
def run_ocr(image):
    """Process a PIL image and return OCR results as a formatted text report.

    Args:
        image: PIL.Image.Image or None (Gradio passes None when no upload).

    Returns:
        str: extracted text plus a metadata footer, or a human-readable
        error message (this function never raises — errors are returned
        as text for display in the Gradio textbox).
    """
    if image is None:
        return "Error: No image provided"
    try:
        print("OCR request received, loading model...")
        model, processor = get_model()
        # NOTE: this message was split across two lines by a mojibaked
        # emoji in the original; restored to one statement.
        print("Model loaded, processing image...")
    except Exception as e:
        error_msg = f"Error loading model: {str(e)}"
        print(f"ERROR: {error_msg}")
        # Heuristic: distinguish memory/CUDA failures from everything else
        # so the user gets actionable advice.
        if "memory" in str(e).lower() or "cuda" in str(e).lower():
            return f"{error_msg}\n\nThis appears to be a memory issue. The DeepSeek-OCR-2 model (3B parameters) may be too large for the free CPU tier.\n\nSolutions:\n- Upgrade to GPU hardware (t4-small)\n- Try with smaller images\n- Use the local Docker version instead"
        else:
            return f"{error_msg}\n\nThis may be due to:\n- Network issues downloading the model\n- Temporary HuggingFace Hub issues\n- Hardware limitations\n\nTry again in a few moments or use the local Docker version."
    try:
        print("Preprocessing image...")
        # The model expects RGB input; normalize palettes/alpha channels.
        if image.mode != "RGB":
            image = image.convert("RGB")
        w, h = image.size
        print(f"Original size: {w}x{h}")
        # Downscale large images to cap inference cost/memory.
        if max(w, h) > MAX_IMAGE_SIZE:
            scale = MAX_IMAGE_SIZE / max(w, h)
            image = image.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)
            print(f"Resized to: {image.size}")
        start = time.time()
        print("Running inference...")
        # Prefer the model's custom chat() API (provided via
        # trust_remote_code); fall back to a generic forward pass.
        if hasattr(model, 'chat'):
            response = model.chat(
                processor,
                image,
                "Extract all text from this image.",
                history=[],
            )
            text = response if isinstance(response, str) else str(response)
        else:
            inputs = processor(images=image, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            if hasattr(outputs, 'logits'):
                # Greedy decode: best-effort fallback, not the model's
                # intended generation path.
                ids = outputs.logits.argmax(-1)
                text = processor.batch_decode(ids, skip_special_tokens=True)[0]
            else:
                text = str(outputs)
        elapsed = time.time() - start
        print(f"Inference completed in {elapsed:.2f}s")
        # Build result with a metadata footer for the UI.
        result = f"=== OCR Result ===\n\n{text}\n\n"
        result += "--- Metadata ---\n"
        result += f"Model: {MODEL_NAME}\n"
        result += f"Device: {DEVICE}\n"
        result += f"Time: {elapsed:.2f}s\n"
        return result
    except Exception as e:
        error_msg = f"Error during inference: {str(e)}"
        print(f"ERROR: {error_msg}")
        return f"{error_msg}\n\n{traceback.format_exc()}"
def get_status():
    """Build a plain-text status report without triggering a model load."""
    loaded_msg = "Yes" if _model is not None else "No (loads on first request)"
    report = [
        "=== DeepSeek-OCR-2 Status ===",
        "",
        f"Model: {MODEL_NAME}",
        f"Device: {DEVICE}",
        f"Dtype: {MODEL_DTYPE}",
        f"CUDA Available: {torch.cuda.is_available()}",
        "",
        f"Model Loaded: {loaded_msg}",
    ]
    # Surface transient load state / cached failure, if any.
    if _loading:
        report.append("Currently loading model...")
    if _load_error:
        report.append(f"Error: {_load_error}")
    report += [
        "",
        "Note: Model loads on first OCR request to avoid startup timeout.",
        "First request may take 1-2 minutes on CPU.",
    ]
    return "\n".join(report)
# Simple Gradio Interface - disable API docs to avoid schema bugs
# Top-level wiring: a single image-in / text-out interface around run_ocr.
demo = gr.Interface(
    fn=run_ocr,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="OCR Result", lines=20),
    title="DeepSeek-OCR-2",
    description=f"Upload an image to extract text. Model: {MODEL_NAME} | Device: {DEVICE}\n\nNote: First request loads the model (~1-2 min on CPU).",
    allow_flagging="never",
    api_name=False  # Disable API to avoid schema generation bug
)
if __name__ == "__main__":
    # Bind to all interfaces on the standard Spaces port; show_api=False
    # matches the api_name=False setting above.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)