Spaces:

triflix
/

deepseek

Sleeping

App Files Files Community

deepseek / app.py

triflix

Update app.py

27a7097 verified about 2 months ago

raw

history blame contribute delete

5.47 kB

	import gradio as gr
	import torch
	from transformers import AutoModel, AutoTokenizer
	import spaces
	import os
	import tempfile
	from PIL import Image, ImageDraw
	import re


	# --------------------------------------------------------------
	# Advanced monkey‑patch: force CPU + float32 everywhere
	# --------------------------------------------------------------
	import torch

	# References to the unpatched methods, captured before they are replaced.
	_original_to = torch.Tensor.to
	_original_half = torch.Tensor.half
	_original_bf16 = torch.Tensor.bfloat16

	# Reduced-precision dtypes that get rewritten to float32.
	_HALF_DTYPES = (torch.bfloat16, torch.float16)


	def _is_cuda_device(value):
	    """Return True when *value* names a CUDA device (str or torch.device)."""
	    if isinstance(value, torch.device):
	        return value.type == "cuda"
	    return isinstance(value, str) and value.startswith("cuda")


	def _patched_to(self, *args, **kwargs):
	    """Replacement for ``torch.Tensor.to`` that pins tensors to CPU/float32.

	    Any CUDA device request (keyword or positional, given as a string or a
	    ``torch.device``) is rewritten to ``"cpu"``, and any ``float16`` /
	    ``bfloat16`` dtype request is rewritten to ``torch.float32``. All other
	    arguments are forwarded unchanged to the original ``Tensor.to``.

	    Args:
	        *args: Positional arguments as accepted by ``torch.Tensor.to``.
	        **kwargs: Keyword arguments as accepted by ``torch.Tensor.to``.

	    Returns:
	        Whatever the original ``torch.Tensor.to`` returns for the rewritten
	        arguments.
	    """
	    if "device" in kwargs:
	        # str() also covers torch.device objects, e.g. "cuda:0".
	        if str(kwargs["device"]).startswith("cuda"):
	            kwargs["device"] = "cpu"
	    else:
	        args = tuple("cpu" if _is_cuda_device(a) else a for a in args)

	    if kwargs.get("dtype") in _HALF_DTYPES:
	        kwargs["dtype"] = torch.float32
	    else:
	        # The isinstance guard keeps tensor arguments out of the `in` test,
	        # which would otherwise trigger an element-wise comparison.
	        args = tuple(
	            torch.float32
	            if isinstance(a, torch.dtype) and a in _HALF_DTYPES
	            else a
	            for a in args
	        )

	    return _original_to(self, *args, **kwargs)


	torch.Tensor.to = _patched_to
	# The converters below ignore their arguments and route through the patched
	# ``to`` so the result is always a float32 / CPU tensor.
	torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)
	torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)
	torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
	# --- End Monkey-Patch ---


	# --- 1. Load Model and Tokenizer ---
	# Executed once at import time, so the tokenizer and model are module-level
	# objects shared by every request.
	print("Loading model and tokenizer...")
	model_name = "deepseek-ai/DeepSeek-OCR"
	# trust_remote_code=True lets transformers run the modeling code shipped in
	# the model repository.
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	# eval() is chained directly onto the freshly loaded model; the result is
	# bound to `model` just as the separate `model = model.eval()` step did.
	model = AutoModel.from_pretrained(
	    model_name,
	    _attn_implementation="eager",
	    trust_remote_code=True,
	    use_safetensors=True,
	).eval()
	print("✅ Model loaded successfully.")

	# --- 2. Main Processing Function ---
	# Prompt used when the task type is not recognised.
	DEFAULT_PROMPT = "<image>\nFree OCR."

	# Prompt sent to the model for each task offered in the UI.
	TASK_PROMPTS = {
	    "📝 Free OCR": DEFAULT_PROMPT,
	    "📄 Convert to Markdown": "<image>\n<|grounding|>Convert the document to markdown.",
	    "📈 Parse Figure": "<image>\nParse the figure.",
	}

	# Resolution preset used when the requested size is not recognised.
	DEFAULT_SIZE = "Gundam (Recommended)"

	# Keyword arguments forwarded to ``model.infer`` for each resolution preset.
	SIZE_CONFIGS = {
	    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
	    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
	    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
	    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
	    "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
	}


	def process_ocr_task(image, model_size, task_type):
	    """Run the OCR model on an uploaded image and return its result.

	    The image is written to ``temp_image.png`` inside a temporary directory,
	    which is also handed to ``model.infer`` as its output path and is removed
	    once inference has finished.

	    Args:
	        image: Uploaded image object exposing ``save(path)``, or ``None`` when
	            nothing was uploaded.
	        model_size: Resolution preset name; unknown names fall back to
	            ``"Gundam (Recommended)"``.
	        task_type: Task label from the UI; unknown labels fall back to the
	            Free OCR prompt.

	    Returns:
	        A reminder message when ``image`` is ``None``; otherwise the value
	        returned by ``model.infer``.
	    """
	    if image is None:
	        return "Please upload an image first."

	    # The model is always moved to CPU in float32, so that is what gets
	    # logged regardless of whether CUDA happens to be available.
	    model_cpu = model.to("cpu", dtype=torch.float32)
	    print("✅ Model is on CPU.")

	    prompt = TASK_PROMPTS.get(task_type, DEFAULT_PROMPT)
	    config = SIZE_CONFIGS.get(model_size, SIZE_CONFIGS[DEFAULT_SIZE])

	    with tempfile.TemporaryDirectory() as output_path:
	        # infer() takes a file path, so the upload is persisted first.
	        temp_image_path = os.path.join(output_path, "temp_image.png")
	        image.save(temp_image_path)

	        print(f"🏃 Running inference with prompt: {prompt}")
	        text_result = model_cpu.infer(
	            tokenizer,
	            prompt=prompt,
	            image_file=temp_image_path,
	            output_path=output_path,
	            base_size=config["base_size"],
	            image_size=config["image_size"],
	            crop_mode=config["crop_mode"],
	            save_results=True,
	            test_compress=True,
	            eval_mode=True,
	        )

	    print(f"====\n📄 Text Result: {text_result}\n====")
	    return text_result

	# --- 3. Build the Gradio Interface ---
	# A usage note on top, then one row with two columns: the inputs and the
	# submit button on the left, the text result on the right.
	with gr.Blocks(title="DeepSeek-OCR X (t)") as demo:
	    gr.Markdown(
	        """
	# DeepSeek-OCR X TUL
	💡 How to use:
	1. Upload an image using the upload box.
	2. Select a Resolution. `Gundam` is recommended for most documents.
	3. Choose a Task Type:
	- 📝 Free OCR: Extracts raw text from the image.
	- 📄 Convert to Markdown: Converts the document into Markdown, preserving structure.
	- 📈 Parse Figure: Extracts structured data from charts and figures.
	"""
	    )

	    with gr.Row():
	        # Left column: everything the user provides.
	        with gr.Column(scale=1):
	            image_input = gr.Image(
	                type="pil",
	                label="🖼️ Upload Image",
	                sources=["upload", "clipboard"],
	            )
	            model_size = gr.Dropdown(
	                choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
	                value="Gundam (Recommended)",
	                label="⚙️ Resolution Size",
	            )
	            task_type = gr.Dropdown(
	                choices=["📝 Free OCR", "📄 Convert to Markdown", "📈 Parse Figure"],
	                value="📝 Free OCR",
	                label="🚀 Task Type",
	            )
	            submit_btn = gr.Button("Process Image", variant="primary")

	        # Right column: the text produced by the model.
	        with gr.Column(scale=2):
	            output_text = gr.Textbox(label="📄 Text Result", lines=15)

	    # Clicking the button feeds the three inputs to process_ocr_task and
	    # shows its return value in the result textbox.
	    submit_btn.click(
	        fn=process_ocr_task,
	        inputs=[image_input, model_size, task_type],
	        outputs=[output_text],
	    )

	# --- 4. Launch the App ---
	# Only start the server when this file is executed as a script.
	if __name__ == "__main__":
	    # Configure the request queue (at most 20 entries), then launch on the
	    # object queue() returns, with a public share link requested.
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch(share=True)