Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoModel, AutoTokenizer | |
| import torch | |
| import spaces | |
| import os | |
| import sys | |
| import tempfile | |
| import shutil | |
| from PIL import Image, ImageDraw, ImageFont, ImageOps | |
| import fitz | |
| import re | |
| import numpy as np | |
| import base64 | |
| from io import StringIO, BytesIO | |
| # ===================== | |
| # MODEL SETUP | |
| # ===================== | |
# Hub repository id shared by the tokenizer and the model weights.
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"

# Both loaders pass trust_remote_code=True, so code shipped in the model
# repository is executed at load time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Load the weights in bfloat16 with flash-attention, switch to inference
# mode and move the model to the GPU in a single expression.
model = (
    AutoModel.from_pretrained(
        MODEL_NAME,
        _attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        use_safetensors=True,
    )
    .eval()
    .cuda()
)
| # ===================== | |
| # CONFIGS | |
| # ===================== | |
# Inference presets keyed by mode name; the values are forwarded to
# model.infer() as base_size / image_size / crop_mode.
MODEL_CONFIGS = {
    "Gundam": dict(base_size=1024, image_size=640, crop_mode=True),
}

# Prompt templates keyed by the task label shown in the UI.
TASK_PROMPTS = {
    "π Free OCR": dict(prompt="<image>\nFree OCR.", has_grounding=False),
}
| # ===================== | |
| # OCR CORE | |
| # ===================== | |
def clean_output(text):
    """Return *text* without leading/trailing whitespace.

    Any falsy input (``None``, ``""``) is normalised to the empty string.
    """
    return text.strip() if text else ""
def process_image(image):
    """Run the "Free OCR" prompt on a PIL image and return the recognised text.

    The image is written to a temporary JPEG, handed to ``model.infer`` and
    the text the model prints to stdout is captured and filtered.

    Args:
        image: A PIL image, or ``None``.

    Returns:
        A 5-tuple ``(text, "", raw, None, [])``. ``text`` is the stripped OCR
        result (or an error / "No text detected" message); ``raw`` is the
        filtered captured output. The remaining slots are fixed placeholders.
    """
    if image is None:
        return "Error: No image provided", "", "", None, []

    # JPEG cannot store alpha or palette data, so flatten those modes first.
    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    image = ImageOps.exif_transpose(image)

    config = MODEL_CONFIGS["Gundam"]
    prompt = TASK_PROMPTS["π Free OCR"]["prompt"]

    # Substrings that mark progress/debug lines printed during inference.
    noise_markers = ("image:", "other:", "PATCHES", "====", "BASE:", "%|", "torch.Size")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
    tmp.close()  # only the reserved path is needed; PIL reopens it by name
    out_dir = tempfile.mkdtemp()
    captured = StringIO()
    try:
        image.save(tmp.name, "JPEG", quality=95)

        # The model prints its result, so stdout is swapped for a buffer.
        # sys.stdout is process-global: anything printed by other threads
        # while inference runs is captured as well.
        saved_stdout = sys.stdout
        sys.stdout = captured
        try:
            model.infer(
                tokenizer=tokenizer,
                prompt=prompt,
                image_file=tmp.name,
                output_path=out_dir,
                base_size=config["base_size"],
                image_size=config["image_size"],
                crop_mode=config["crop_mode"],
            )
        finally:
            # Always restore stdout, even when inference raises; otherwise
            # every later print in the process would vanish into the buffer.
            sys.stdout = saved_stdout
    finally:
        # Best-effort cleanup of the scratch files on success and failure.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        shutil.rmtree(out_dir, ignore_errors=True)

    result = "\n".join(
        line
        for line in captured.getvalue().split("\n")
        if not any(marker in line for marker in noise_markers)
    ).strip()

    if not result:
        return "No text detected", "", "", None, []

    cleaned = clean_output(result)
    return cleaned, "", result, None, []
def process_pdf(path, page_num):
    """Render one page of a PDF at 300 DPI and run OCR on it.

    Args:
        path: Filesystem path of the PDF.
        page_num: 1-based page number to process.

    Returns:
        The 5-tuple produced by ``process_image``, or a 5-tuple whose first
        element is an "Invalid page number" message when *page_num* is out of
        range.
    """
    doc = fitz.open(path)
    try:
        total_pages = len(doc)
        if page_num < 1 or page_num > total_pages:
            return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []

        page = doc.load_page(page_num - 1)
        # PDF user space is 72 units per inch, so 300/72 scales to 300 DPI.
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
        img = Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # Release the document even if rendering or decoding fails.
        doc.close()

    return process_image(img)
def process_file(path, page_num):
    """Dispatch an uploaded file to the PDF or the image OCR pipeline.

    Args:
        path: Path of the uploaded file; falsy when nothing was uploaded.
        page_num: 1-based page number, used only for ``.pdf`` files.

    Returns:
        The 5-tuple returned by ``process_pdf`` / ``process_image``, or an
        error 5-tuple when *path* is empty.
    """
    if not path:
        return "Error: No file uploaded", "", "", None, []

    # The extension check is case-insensitive.
    is_pdf = path.lower().endswith(".pdf")
    if is_pdf:
        return process_pdf(path, page_num)
    return process_image(Image.open(path))
| # ===================== | |
| # PDF HELPERS | |
| # ===================== | |
def get_pdf_page_count(file_path):
    """Return the number of pages in a PDF.

    Anything that is not a ``.pdf`` path (including an empty or ``None``
    path) counts as a single page.
    """
    is_pdf = bool(file_path) and file_path.lower().endswith(".pdf")
    if not is_pdf:
        return 1

    pdf = fitz.open(file_path)
    page_total = len(pdf)
    pdf.close()
    return page_total
def load_image(file_path, page_num=1):
    """Load the preview image for an uploaded file.

    Args:
        file_path: Path of the uploaded file; falsy when nothing is uploaded.
        page_num: 1-based PDF page to render. Out-of-range values are
            clamped to the first/last page; ``None`` is treated as page 1.

    Returns:
        A PIL image, or ``None`` when *file_path* is empty.
    """
    if not file_path:
        return None

    if not file_path.lower().endswith(".pdf"):
        return Image.open(file_path)

    # An emptied number field can deliver None; int(None) would raise.
    requested = 1 if page_num is None else int(page_num)

    doc = fitz.open(file_path)
    try:
        page_idx = max(0, min(requested - 1, len(doc) - 1))
        page = doc.load_page(page_idx)
        # PDF user space is 72 units per inch, so 300/72 scales to 300 DPI.
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
        img = Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # Release the document even if rendering fails.
        doc.close()
    return img
def update_page_selector(file_path):
    """Build the Gradio update for the page-number field after an upload.

    The field is shown, reset to page 1 and bounded to the document's page
    count for ``.pdf`` uploads; it is hidden for every other input.
    """
    if not file_path or not file_path.lower().endswith(".pdf"):
        return gr.update(visible=False)

    page_count = get_pdf_page_count(file_path)
    return gr.update(
        visible=True,
        maximum=page_count,
        minimum=1,
        value=1,
        # The range separator is an en dash; the previous label carried a
        # mis-decoded character in its place.
        label=f"Select Page (1–{page_count})",
    )
| # ===================== | |
| # UI | |
| # ===================== | |
# Single-page UI: upload + preview + page picker on the left, OCR output
# tabs on the right. Event wiring lives inside the Blocks context.
with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek OCR — Free OCR") as demo:
    gr.Markdown("\n# OCR\n")

    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(
                label="Upload Image or PDF",
                file_types=["image", ".pdf"],
                type="filepath",
            )
            input_img = gr.Image(
                label="Preview",
                type="pil",
                height=300,
            )
            # Hidden until a PDF is uploaded (see update_page_selector).
            page_selector = gr.Number(
                label="Select Page",
                value=1,
                minimum=1,
                step=1,
                visible=False,
            )
            # Hardcoded + locked
            mode = gr.Dropdown(
                ["Gundam"],
                value="Gundam",
                label="Mode",
                interactive=False,
            )
            task = gr.Dropdown(
                ["π Free OCR"],
                value="π Free OCR",
                label="Task",
                interactive=False,
            )
            prompt = gr.Textbox(visible=False)
            btn = gr.Button("Extract OCR", variant="primary", size="lg")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("Text"):
                    text_out = gr.Textbox(lines=20)
                with gr.Tab("Raw Output"):
                    raw_out = gr.Textbox(lines=20)

    # =====================
    # EVENTS
    # =====================
    # A new upload refreshes both the preview and the page selector;
    # changing the page re-renders the preview.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])

    def run(image, file_path, page_num):
        """Run OCR on the uploaded file, else on the preview image.

        Returns the 5-tuple produced by ``process_file`` / ``process_image``,
        or an ``"Error"`` 5-tuple when neither input is available.
        """
        if file_path:
            # An emptied page field can deliver None; default to page 1
            # instead of crashing on int(None).
            page = 1 if page_num is None else int(page_num)
            return process_file(file_path, page)
        if image is not None:
            return process_image(image)
        return "Error", "", "", None, []

    # Only slots 0 and 2 of the result are displayed; the placeholder slots
    # are absorbed by throwaway State components.
    btn.click(
        run,
        [input_img, file_in, page_selector],
        [text_out, gr.State(), raw_out, gr.State(), gr.State()],
    )
| # ===================== | |
| # LAUNCH | |
| # ===================== | |
# Start the app only when executed as a script; at most 20 requests may
# wait in the queue.
if __name__ == "__main__":
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()