Dots-OCR

Running on Zero

App Files Community

yahtzee committed about 1 month ago

Commit

94c64d2

1 Parent(s): 4a015a8

allow custom prompts

Browse files

Files changed (1) hide show

app.py +15 -8

app.py CHANGED Viewed

@@ -352,7 +352,7 @@ pdf_cache = {
     "results": []
 }
 @spaces.GPU()
-def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
     """Run inference on an image with the given prompt"""
     try:
         if model is None or processor is None:
@@ -367,7 +367,7 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
                         "type": "image",
                         "image": image
                     },
-                    {"type": "text", "text": prompt}
                 ]
             }
         ]
@@ -425,7 +425,9 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
 def process_image(
     image: Image.Image,
     min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None
 ) -> Dict[str, Any]:
     """Process a single image with the specified prompt mode"""
     try:
@@ -434,7 +436,7 @@ def process_image(
             image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
         # Run inference with the default prompt
-        raw_output = inference(image, prompt)
         # Process results based on prompt mode
         result = {
@@ -685,6 +687,7 @@ def create_gradio_interface():
                 # Advanced settings
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(
                         minimum=1000,
                         maximum=32000,
@@ -744,7 +747,7 @@ def create_gradio_interface():
                         )
         # Event handlers
-        def process_document(file_path, max_tokens, min_pix, max_pix):
             """Process the uploaded document"""
             global pdf_cache
@@ -770,7 +773,9 @@ def create_gradio_interface():
                         result = process_image(
                             img,
                             min_pixels=int(min_pix) if min_pix else None,
-                            max_pixels=int(max_pix) if max_pix else None
                         )
                         all_results.append(result)
                         if result.get('markdown_content'):
@@ -799,7 +804,9 @@ def create_gradio_interface():
                     result = process_image(
                         image,
                         min_pixels=int(min_pix) if min_pix else None,
-                        max_pixels=int(max_pix) if max_pix else None
                     )
                     pdf_cache["results"] = [result]
@@ -875,7 +882,7 @@ def create_gradio_interface():
         process_btn.click(
             process_document,
-            inputs=[file_input, max_new_tokens, min_pixels, max_pixels],
             outputs=[processed_image, markdown_output, json_output]
         )

     "results": []
 }
 @spaces.GPU()
+def inference(image: Image.Image, max_new_tokens: int = 24000, custom_prompt: str = '') -> str:
     """Run inference on an image with the given prompt"""
     try:
         if model is None or processor is None:
                         "type": "image",
                         "image": image
                     },
+                    {"type": "text", "text": custom_prompt}
                 ]
             }
         ]
 def process_image(
     image: Image.Image,
     min_pixels: Optional[int] = None,
+    max_pixels: Optional[int] = None,
+    custom_prompt: Optional[str] = None,
+    max_new_tokens: int = 24000,
 ) -> Dict[str, Any]:
     """Process a single image with the specified prompt mode"""
     try:
             image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
         # Run inference with the default prompt
+        raw_output = inference(image=image, custom_prompt=custom_prompt, max_new_tokens=max_new_tokens)
         # Process results based on prompt mode
         result = {
                 # Advanced settings
                 with gr.Accordion("Advanced Settings", open=False):
+                    custom_prompt = gr.Textbox(label="Custom Prompt", value=prompt, lines=12, placeholder="Enter a custom prompt...", info="Modify the OCR / layout extraction prompt.")
                     max_new_tokens = gr.Slider(
                         minimum=1000,
                         maximum=32000,
                         )
         # Event handlers
+        def process_document(file_path, max_tokens, min_pix, max_pix, custom_prompt):
             """Process the uploaded document"""
             global pdf_cache
                         result = process_image(
                             img,
                             min_pixels=int(min_pix) if min_pix else None,
+                            max_pixels=int(max_pix) if max_pix else None,
+                            custom_prompt=custom_prompt,
+                            max_new_tokens=max_tokens
                         )
                         all_results.append(result)
                         if result.get('markdown_content'):
                     result = process_image(
                         image,
                         min_pixels=int(min_pix) if min_pix else None,
+                        max_pixels=int(max_pix) if max_pix else None,
+                        custom_prompt=custom_prompt,
+                        max_new_tokens=max_tokens
                     )
                     pdf_cache["results"] = [result]
         process_btn.click(
             process_document,
+            inputs=[file_input, max_new_tokens, min_pixels, max_pixels, custom_prompt],
             outputs=[processed_image, markdown_output, json_output]
         )