Spaces: Running on Zero

update app

app.py (CHANGED)
@@ -1,5 +1,6 @@
 import gradio as gr
 import torch
+import requests
 from transformers import AutoModel, AutoTokenizer
 import spaces
 from typing import Iterable
@@ -13,7 +14,6 @@ from docling_core.types.doc import DoclingDocument, DocTagsDocument
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# --- # Device and CUDA Setup Check ---
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 print("torch.__version__ =", torch.__version__)
 print("torch.version.cuda =", torch.version.cuda)
@@ -33,7 +33,7 @@ colors.steel_blue = colors.Color(
     c200="#A8CCE1",
     c300="#7DB3D2",
     c400="#529AC3",
-    c500="#4682B4",
+    c500="#4682B4",
     c600="#3E72A0",
     c700="#36638C",
     c800="#2E5378",
@@ -97,8 +97,6 @@ css = """
 }
 """
 
-
-# --- 1. Load Model and Tokenizer directly to the correct device ---
 print("Determining device...")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"✅ Using device: {device}")
@@ -107,7 +105,6 @@ print("Loading model and tokenizer...")
 model_name = "strangervisionhf/deepseek-ocr-latest-transformers"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-# Load the model directly to the specified device and set to evaluation mode
 model = AutoModel.from_pretrained(
     model_name,
     _attn_implementation="flash_attention_2",
@@ -115,14 +112,11 @@ model = AutoModel.from_pretrained(
     use_safetensors=True,
 ).to(device).eval()  # Move to device and set to eval mode
 
-# Also apply the desired dtype if using a GPU
 if device.type == 'cuda':
     model = model.to(torch.bfloat16)
 
 print("✅ Model loaded successfully to device and in eval mode.")
 
-
-# --- Helper function to find pre-generated result images ---
 def find_result_image(path):
     for filename in os.listdir(path):
         if "grounding" in filename or "result" in filename:
@@ -133,7 +127,6 @@ def find_result_image(path):
             print(f"Error opening result image {filename}: {e}")
     return None
 
-# --- 2. Main Processing Function (Simplified) ---
 @spaces.GPU
 def process_ocr_task(image, model_size, task_type, ref_text):
     """
@@ -142,7 +135,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
     if image is None:
         return "Please upload an image first.", None
 
-    # No need to move the model to GPU here; it's already done at startup.
     print("✅ Model is already on the designated device.")
 
     with tempfile.TemporaryDirectory() as output_path:
@@ -163,7 +155,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         temp_image_path = os.path.join(output_path, "temp_image.png")
         image.save(temp_image_path)
 
-        # Configure model size
         size_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -174,7 +165,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
 
         print(f"🏃 Running inference with prompt: {prompt}")
-        # Use the globally defined 'model' which is already on the GPU
         text_result = model.infer(
             tokenizer,
             prompt=prompt,
@@ -190,7 +180,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
 
         print(f"====\n📄 Text Result: {text_result}\n====")
 
-        # --- Logic to draw bounding boxes ---
         result_image_pil = None
        pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
        matches = list(pattern.finditer(text_result))
@@ -224,9 +213,11 @@ example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
 
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **DeepSeek OCR [exp]**", elem_id="main-title")
+    gr.Markdown("> This app is running with transformers v.4.57.1 and torch v.2.6.0.")
+
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"],value=example_image)
+            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"], value=example_image)
             model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Large", label="Resolution Size")
             task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"], value="Convert to Markdown", label="Task Type")
             ref_text_input = gr.Textbox(label="Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
@@ -236,14 +227,11 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             output_text = gr.Textbox(label="Output(OCR)", lines=15, show_copy_button=True)
             output_image = gr.Image(label="Layout Detection(If Any)", type="pil")
 
-    # --- UI Interaction Logic ---
     def toggle_ref_text_visibility(task):
         return gr.Textbox(visible=True) if task == "Locate Object by Reference" else gr.Textbox(visible=False)
 
     task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
     submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])
 
-
-# --- 4. Launch the App ---
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)
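A few notes on the patterns this commit leaves in place. The model is loaded once at import time, so every request reuses the same weights rather than reloading per call. A minimal, self-contained restatement of that startup sequence (assuming trust_remote_code=True is needed for the model just as for the tokenizer; the Space additionally passes _attn_implementation="flash_attention_2", which requires the flash-attn package to be installed):

    import torch
    from transformers import AutoModel, AutoTokenizer

    model_name = "strangervisionhf/deepseek-ocr-latest-transformers"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Load once at startup, place on the detected device, freeze in eval mode.
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,  # assumed, mirroring the tokenizer call
        use_safetensors=True,
    ).to(device).eval()

    # Cast to bfloat16 only when a GPU is present; CPU inference stays float32.
    if device.type == "cuda":
        model = model.to(torch.bfloat16)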
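This Space runs on ZeroGPU ("Running on Zero"), where a GPU is attached only while a function decorated with @spaces.GPU executes; that is why process_ocr_task carries the decorator and why no device transfer happens inside it. A minimal sketch of the pattern:

    import spaces
    import torch

    @spaces.GPU  # ZeroGPU attaches a GPU for the duration of this call
    def gpu_work(x: torch.Tensor) -> torch.Tensor:
        # Work that actually needs the GPU belongs inside the decorated
        # function; here we just move a tensor over and back as a demo.
        return (x.to("cuda") * 2).cpu()

Calls that need more than the default time slice can request it explicitly, e.g. @spaces.GPU(duration=120).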
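The newly added import requests supports the pre-existing startup line visible in a hunk header above, example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB"). A sketch of that idiom; the URL below is a hypothetical placeholder, since the Space's actual url is defined outside the hunks shown:

    import requests
    from PIL import Image

    # Hypothetical placeholder; the app defines its own `url` elsewhere.
    url = "https://example.com/sample_document.png"

    # With stream=True, response.raw is a file-like object that PIL can
    # decode directly instead of buffering the whole download first.
    example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")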
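For grounding tasks the model interleaves <|det|>[[x1, y1, x2, y2]]<|/det|> spans into its text output, which the app extracts with the compiled pattern above; the code that turns those matches into the rendered layout image falls outside the diff. A sketch of how such matches could be drawn, under the assumption (common for this model family, but not confirmed by these hunks) that coordinates are normalized to a 0-999 grid:

    import re
    from PIL import Image, ImageDraw

    DET = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")

    def draw_detections(image: Image.Image, text_result: str) -> Image.Image | None:
        # Returns a copy of the image with one rectangle per <|det|> match,
        # or None when the output contains no grounding spans.
        matches = list(DET.finditer(text_result))
        if not matches:
            return None
        out = image.copy()
        draw = ImageDraw.Draw(out)
        w, h = out.size
        for m in matches:
            x1, y1, x2, y2 = (int(g) for g in m.groups())
            # Assumed 0-999 normalized coordinates, rescaled to pixels.
            draw.rectangle(
                (x1 / 999 * w, y1 / 999 * h, x2 / 999 * w, y2 / 999 * h),
                outline="red",
                width=3,
            )
        return out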
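Finally, the UI wiring: toggle_ref_text_visibility works because a Gradio event handler may return a component constructed with only the properties it wants to change, here visible. A self-contained toy demo of the same pattern (not the app itself):

    import gradio as gr

    with gr.Blocks() as demo:
        task = gr.Dropdown(
            choices=["Free OCR", "Locate Object by Reference"],
            value="Free OCR",
            label="Task Type",
        )
        ref = gr.Textbox(label="Reference Text (for Locate task)", visible=False)

        def toggle(choice):
            # Returning a component with only `visible` set updates just
            # that property on the existing textbox.
            return gr.Textbox(visible=(choice == "Locate Object by Reference"))

        task.change(fn=toggle, inputs=task, outputs=ref)

    if __name__ == "__main__":
        demo.queue(max_size=20).launch()

One caveat on the script's last line: share=True is ignored when running on Spaces (Gradio logs a warning and serves the app directly), so the flag only matters for local runs.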