#!/usr/bin/env python3
"""
HuggingFace Spaces Gradio App for DeepSeek-OCR-2
Uses lazy loading to avoid startup timeout on free CPU tier.
"""
import os
import sys
import traceback
import time
import threading
import gradio as gr
import torch
from PIL import Image
# Configuration
MODEL_NAME = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-OCR-2")
MODEL_DTYPE = os.getenv("MODEL_DTYPE", "float16")
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
HF_TOKEN = os.getenv("HF_TOKEN", None)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE_MAP = {
"float16": torch.float16,
"bfloat16": torch.bfloat16,
"float32": torch.float32,
}
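# Note: unrecognized MODEL_DTYPE values silently fall back to float16 below.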
TORCH_DTYPE = DTYPE_MAP.get(MODEL_DTYPE, torch.float16)
# Global state for lazy loading
_model = None
_processor = None
_model_lock = threading.Lock()
_loading = False
_load_error = None
print("🚀 App starting (lazy model loading)")
print(f"📍 Device: {DEVICE}")
print(f"🔒 Dtype: {MODEL_DTYPE}")
print(f"📦 Model: {MODEL_NAME}")
def get_model():
"""Lazy load the model on first request."""
global _model, _processor, _loading, _load_error
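    # Fast path: reading the global without the lock is safe under the GIL;
    # the lock below only serializes the one-time initialization.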
if _model is not None:
return _model, _processor
with _model_lock:
# Double-check after acquiring lock
if _model is not None:
return _model, _processor
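        # A previous failed load short-circuits later requests instead of
        # re-downloading a known-bad model; restart the Space to retry.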
if _load_error:
raise RuntimeError(f"Model failed to load: {_load_error}")
_loading = True
        print(f"⏳ Loading model: {MODEL_NAME}...")
        mem_info = f"{torch.cuda.memory_allocated() / 1e9:.2f} GB allocated" if torch.cuda.is_available() else "CPU mode"
        print(f"   Memory info: {mem_info}")
try:
from transformers import AutoModel, AutoProcessor
            print("📦 Loading processor...")
processor = AutoProcessor.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
token=HF_TOKEN,
)
            print("🧠 Loading model...")
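            # device_map="auto" lets accelerate place weights on the GPU at load
            # time; in CPU mode we skip it and move the model explicitly below.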
model = AutoModel.from_pretrained(
MODEL_NAME,
torch_dtype=TORCH_DTYPE,
trust_remote_code=True,
low_cpu_mem_usage=True,
device_map="auto" if torch.cuda.is_available() else None,
token=HF_TOKEN,
)
            if not torch.cuda.is_available():
                print(f"📍 Moving model to {DEVICE}...")
                model = model.to(DEVICE)
model = model.eval()
_model = model
_processor = processor
_loading = False
            print(f"✅ Model loaded successfully on {DEVICE}")
return _model, _processor
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
_load_error = error_msg
_loading = False
            print(f"❌ Failed to load model: {error_msg}", file=sys.stderr)
            traceback.print_exc()
            raise RuntimeError(error_msg) from e
def run_ocr(image):
"""Process image and return OCR results as text."""
if image is None:
return "Error: No image provided"
try:
        print("🔄 OCR request received, loading model...")
        model, processor = get_model()
        print("✅ Model loaded, processing image...")
except Exception as e:
error_msg = f"Error loading model: {str(e)}"
        print(f"❌ {error_msg}")
# Check if it's a memory issue
if "memory" in str(e).lower() or "cuda" in str(e).lower():
            return f"{error_msg}\n\n💡 This appears to be a memory issue. The DeepSeek-OCR-2 model (3B parameters) may be too large for the free CPU tier.\n\nSolutions:\n- Upgrade to GPU hardware (t4-small)\n- Try with smaller images\n- Use the local Docker version instead"
else:
return f"{error_msg}\n\nThis may be due to:\n- Network issues downloading the model\n- Temporary HuggingFace Hub issues\n- Hardware limitations\n\nTry again in a few moments or use the local Docker version."
try:
        print("🖼️ Preprocessing image...")
# Preprocess
if image.mode != "RGB":
image = image.convert("RGB")
w, h = image.size
        print(f"📏 Original size: {w}x{h}")
if max(w, h) > MAX_IMAGE_SIZE:
scale = MAX_IMAGE_SIZE / max(w, h)
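            # LANCZOS resampling helps keep glyph edges sharp when downscaling,
            # which matters for OCR accuracy.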
image = image.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)
            print(f"📏 Resized to: {image.size}")
start = time.time()
        print("🚀 Running inference...")
# Run inference
if hasattr(model, 'chat'):
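            # Remote-code checkpoints often ship a high-level chat() helper that
            # handles prompt templating and generation internally; prefer it.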
response = model.chat(
processor,
image,
"Extract all text from this image.",
history=[],
)
text = response if isinstance(response, str) else str(response)
else:
inputs = processor(images=image, return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
outputs = model(**inputs)
if hasattr(outputs, 'logits'):
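                # Greedy per-position argmax is a rough single-pass decode, not
                # true autoregressive generation; output quality will be limited.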
ids = outputs.logits.argmax(-1)
text = processor.batch_decode(ids, skip_special_tokens=True)[0]
else:
text = str(outputs)
elapsed = time.time() - start
        print(f"✅ Inference completed in {elapsed:.2f}s")
# Build result
result = f"=== OCR Result ===\n\n{text}\n\n"
        result += "--- Metadata ---\n"
result += f"Model: {MODEL_NAME}\n"
result += f"Device: {DEVICE}\n"
result += f"Time: {elapsed:.2f}s\n"
return result
except Exception as e:
error_msg = f"Error during inference: {str(e)}"
        print(f"❌ {error_msg}")
return f"{error_msg}\n\n{traceback.format_exc()}"
def get_status():
"""Return current status without loading model."""
lines = [
"=== DeepSeek-OCR-2 Status ===",
"",
f"Model: {MODEL_NAME}",
f"Device: {DEVICE}",
f"Dtype: {MODEL_DTYPE}",
f"CUDA Available: {torch.cuda.is_available()}",
"",
f"Model Loaded: {'Yes' if _model is not None else 'No (loads on first request)'}",
]
if _loading:
lines.append("Currently loading model...")
if _load_error:
lines.append(f"Error: {_load_error}")
lines.extend([
"",
"Note: Model loads on first OCR request to avoid startup timeout.",
"First request may take 1-2 minutes on CPU.",
])
return "\n".join(lines)
# Simple Gradio Interface - disable API docs to avoid schema bugs
demo = gr.Interface(
fn=run_ocr,
inputs=gr.Image(type="pil", label="Upload Image"),
outputs=gr.Textbox(label="OCR Result", lines=20),
title="DeepSeek-OCR-2",
description=f"Upload an image to extract text. Model: {MODEL_NAME} | Device: {DEVICE}\n\nNote: First request loads the model (~1-2 min on CPU).",
allow_flagging="never",
api_name=False # Disable API to avoid schema generation bug
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)