Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Community

prithivMLmods committed on Oct 20

Commit

d618111

verified ·

1 Parent(s): 5d80be9

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -28

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -100,7 +101,7 @@ if not os.path.exists(CACHE_PATH):
 # Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
-    local_dir=os.path.join(CACHE_PATH, 'dots.ocr'),
     max_workers=20,
     local_dir_use_symlinks=False
 )
@@ -159,15 +160,6 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load PaddleOCR
-MODEL_ID_P = "strangervisionhf/paddle"
-processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
-model_p = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID_P,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16
-).to(device).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
@@ -181,8 +173,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
-    elif model_name == "PaddleOCR":
-        processor, model = processor_p, model_p
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -193,19 +183,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     images = [image.convert("RGB")]
-    # Create the prompt based on the specific model's requirements
-    if model_name == "PaddleOCR":
-        # PaddleOCR's template expects a single string with an image placeholder
-        messages = [
-            {"role": "user", "content": f"<image>\n{text}"}
-        ]
-    else:
-        # Standard format for Nanonets and Dots.OCR
-        messages = [
-            {"role": "user", "content": [{"type": "image"}] + [{"type": "text", "text": text}]}
-        ]
-    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -253,12 +237,12 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=13, show_copy_button=True)
-            with gr.Accordion("Formatted Result", open=True):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Dots.OCR", "PaddleOCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )

     AutoProcessor,
     TextIteratorStreamer,
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 # Download the model files locally
 model_path_d_local = snapshot_download(
     repo_id='rednote-hilab/dots.ocr',
+    local_dir=CACHE_PATH,
     max_workers=20,
     local_dir_use_symlinks=False
 )
     trust_remote_code=True
 ).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
         processor, model = processor_m, model_m
     elif model_name == "Dots.OCR":
         processor, model = processor_d, model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     images = [image.convert("RGB")]
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
+            with gr.Accordion("Formatted Result", open=False):
                 formatted_output = gr.Markdown(label="Formatted Result")
             model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR"],
                 label="Select Model",
                 value="Nanonets-OCR2-3B"
             )