| import gradio as gr |
| import torch |
| from transformers import AutoModel, AutoTokenizer |
| import spaces |
| import os |
| import tempfile |
| from PIL import Image, ImageDraw |
| import re |
|
|
|
|
| |
| |
| |
| import torch |
|
|
# ---------------------------------------------------------------------------
# CPU / float32 compatibility shim for torch.Tensor
#
# After this section runs, any call to Tensor.to / .half / .bfloat16 / .cuda
# that asks for a CUDA device or a 16-bit float dtype is silently redirected
# to the CPU and to float32.  Presumably this lets model code that hard-codes
# `.cuda()` / `.bfloat16()` run on a CPU-only host -- confirm with the author.
# NOTE: this is a process-wide monkey-patch; it affects every tensor, not
# only the OCR model's.
# ---------------------------------------------------------------------------

# Unpatched methods.  `_original_to` is what the wrapper delegates to; the
# other two are only saved (nothing in the visible code calls them).
_original_to = torch.Tensor.to
_original_half = torch.Tensor.half
_original_bf16 = torch.Tensor.bfloat16

# Dtypes that are rewritten to float32.
_REDUCED_PRECISION_DTYPES = (torch.bfloat16, torch.float16)


def _redirect_device(value):
    """Return 'cpu' if *value* names a CUDA device, otherwise *value* as is.

    Both spellings are recognised: a string such as 'cuda' / 'cuda:0' and a
    ``torch.device`` object.  Anything else (dtypes, tensors, bools, None)
    passes through untouched.
    """
    if isinstance(value, (str, torch.device)) and str(value).startswith('cuda'):
        return 'cpu'
    return value


def _redirect_dtype(value):
    """Return torch.float32 if *value* is float16/bfloat16, else *value*."""
    if isinstance(value, torch.dtype) and value in _REDUCED_PRECISION_DTYPES:
        return torch.float32
    return value


def _patched_to(self, *args, **kwargs):
    """Replacement for ``torch.Tensor.to`` that pins tensors to CPU/float32.

    Accepts exactly the arguments of the original ``Tensor.to``.  Before
    delegating to it, every CUDA device (positional or ``device=``) becomes
    'cpu' and every float16/bfloat16 dtype (positional or ``dtype=``) becomes
    ``torch.float32``.  All other arguments are forwarded unchanged.
    """
    # Positional form: to(device), to(dtype), to(device, dtype, ...).
    # Each helper ignores values that are not its concern, so chaining them
    # over every positional argument is safe.
    args = tuple(_redirect_dtype(_redirect_device(a)) for a in args)

    # Keyword form: to(device=..., dtype=...).
    if 'device' in kwargs:
        kwargs['device'] = _redirect_device(kwargs['device'])
    if 'dtype' in kwargs:
        kwargs['dtype'] = _redirect_dtype(kwargs['dtype'])

    return _original_to(self, *args, **kwargs)


# Install the shim.  half()/bfloat16() now yield float32 tensors and cuda()
# keeps the tensor on the CPU; extra arguments to those calls are ignored.
torch.Tensor.to = _patched_to
torch.Tensor.half = lambda self, *a, **k: self.to(torch.float32)
torch.Tensor.bfloat16 = lambda self, *a, **k: self.to(torch.float32)
torch.Tensor.cuda = lambda self, *a, **k: self.to("cpu")
| |
|
|
|
|
| |
# Load the tokenizer and the OCR model once, at import time; the request
# handler reuses these module-level objects on every call.
#
# NOTE(review): trust_remote_code=True makes transformers execute Python code
# shipped in the model repository.  Consider pinning a `revision=` so the
# executed code cannot change underneath this app.
print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="eager",
    trust_remote_code=True,
    use_safetensors=True,
)
# Switch to inference mode.
model = model.eval()
# The original message started with a mis-encoded glyph whose stray
# line-break character split the literal across two lines; plain ASCII
# avoids that.
print("Model loaded successfully.")
|
|
| |
# Prompts for the non-default tasks, keyed by the plain-text part of the
# dropdown label.  Matching on this fragment (instead of the full label)
# keeps dispatch working regardless of how the decorative glyph in front of
# the label happens to be encoded.
_TASK_PROMPTS = (
    ("Convert to Markdown", "<image>\n<|grounding|>Convert the document to markdown."),
    ("Parse Figure", "<image>\nParse the figure."),
)
# Used for "Free OCR" and for any label that is not recognised.
_DEFAULT_PROMPT = "<image>\nFree OCR."

# Keyword arguments forwarded to `model.infer` for each resolution preset.
_SIZE_CONFIGS = {
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
}
# Preset used when `model_size` is not one of the keys above.
_DEFAULT_SIZE = "Gundam (Recommended)"


def _prompt_for_task(task_type):
    """Return the model prompt for a task label (default: the Free OCR prompt)."""
    label = "" if task_type is None else str(task_type)
    for marker, prompt in _TASK_PROMPTS:
        if marker in label:
            return prompt
    return _DEFAULT_PROMPT


def process_ocr_task(image, model_size, task_type):
    """Run one OCR request and return the model's result.

    Args:
        image: The uploaded image, or None when nothing was uploaded.  It only
            needs a ``save(path)`` method; the UI supplies a PIL image.
        model_size: Name of a resolution preset ("Tiny", "Small", "Base",
            "Large", "Gundam (Recommended)").  Unknown names fall back to
            "Gundam (Recommended)".
        task_type: Task label from the UI.  Labels containing
            "Convert to Markdown" or "Parse Figure" select those prompts;
            anything else runs Free OCR.

    Returns:
        A fixed hint string when ``image`` is None; otherwise whatever
        ``model.infer`` returns (presumably the recognised text -- verify
        against the model's remote code).
    """
    if image is None:
        return "Please upload an image first."

    # The model is always placed on the CPU in float32, so report exactly
    # that.  (Previously this logged "GPU" whenever CUDA was merely available.)
    model_cpu = model.to("cpu", dtype=torch.float32)
    print("Model is on CPU.")

    prompt = _prompt_for_task(task_type)
    config = _SIZE_CONFIGS.get(model_size, _SIZE_CONFIGS[_DEFAULT_SIZE])

    # Scratch directory for the input image and for anything `infer` writes;
    # it is removed when the block exits.
    with tempfile.TemporaryDirectory() as output_path:
        # `infer` takes a file path, so persist the in-memory image first.
        temp_image_path = os.path.join(output_path, "temp_image.png")
        image.save(temp_image_path)

        print(f"Running inference with prompt: {prompt}")
        text_result = model_cpu.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=output_path,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
            eval_mode=True,
        )

        print(f"====\nText Result: {text_result}\n====")

        return text_result
|
|
| |
# Gradio front end: an intro text, one column of inputs (image, resolution
# preset, task type, submit button) and one column with the result textbox.
#
# NOTE(review): the glyphs in front of several labels look mis-encoded
# (probably emoji).  They are left untouched here because process_ocr_task
# compares `task_type` against these exact strings.
with gr.Blocks(title="DeepSeek-OCR X (t)") as demo:
    intro_markdown = """
# DeepSeek-OCR X TUL
**π‘ How to use:**
1. **Upload an image** using the upload box.
2. Select a **Resolution**. `Gundam` is recommended for most documents.
3. Choose a **Task Type**:
- **π Free OCR**: Extracts raw text from the image.
- **π Convert to Markdown**: Converts the document into Markdown, preserving structure.
- **π Parse Figure**: Extracts structured data from charts and figures.
"""
    gr.Markdown(intro_markdown)

    # Values offered by the two dropdowns; the submit handler receives the
    # selected strings as-is.
    resolution_choices = ["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"]
    task_choices = ["π Free OCR", "π Convert to Markdown", "π Parse Figure"]

    with gr.Row():
        # Input column.
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="πΌοΈ Upload Image",
                sources=["upload", "clipboard"],
            )
            model_size = gr.Dropdown(
                choices=resolution_choices,
                value="Gundam (Recommended)",
                label="βοΈ Resolution Size",
            )
            task_type = gr.Dropdown(
                choices=task_choices,
                value="π Free OCR",
                label="π Task Type",
            )
            submit_btn = gr.Button("Process Image", variant="primary")

        # Output column.
        with gr.Column(scale=2):
            output_text = gr.Textbox(label="π Text Result", lines=15)

    # Clicking the button calls process_ocr_task(image, size, task) and puts
    # its return value into the textbox.
    submit_btn.click(
        fn=process_ocr_task,
        inputs=[image_input, model_size, task_type],
        outputs=[output_text],
    )
|
|
| |
if __name__ == "__main__":
    # Enable request queuing with a queue size limit of 20, then start the
    # server with share=True (requests a public share link from Gradio).
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(share=True)