Spaces:

prithivMLmods
/

POINTS-Reader-OCR

Running on Zero

App Files Community

prithivMLmods committed on Sep 11, 2025

Commit

44691c0

verified ·

1 Parent(s): b4d0ff9

update app

Browse files

Files changed (1) hide show

app.py +56 -128

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ import tempfile
 import gradio as gr
 import requests
 import torch
-from PIL import Image, ImageDraw
 import fitz
 import numpy as np
@@ -130,7 +130,7 @@ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: i
 def process_document_stream(
     image: Image.Image,
     prompt_input: str,
-    image_scale_factor: float,
     max_new_tokens: int,
     temperature: float,
     top_p: float,
@@ -138,7 +138,7 @@ def process_document_stream(
     repetition_penalty: float
 ):
     """
-    Main function for standard OCR, handles model inference using tencent/POINTS-Reader.
     """
     if image is None:
         yield "Please upload an image.", ""
@@ -147,135 +147,66 @@ def process_document_stream(
         yield "Please enter a prompt.", ""
         return
     if image_scale_factor > 1.0:
         try:
             original_width, original_height = image.size
             new_width = int(original_width * image_scale_factor)
             new_height = int(original_height * image_scale_factor)
             image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         except Exception as e:
             print(f"Error during image scaling: {e}")
     temp_image_path = None
     try:
         temp_dir = tempfile.gettempdir()
         temp_image_path = os.path.join(temp_dir, f"temp_image_{uuid.uuid4()}.png")
         image.save(temp_image_path)
         content = [
             dict(type='image', image=temp_image_path),
             dict(type='text', text=prompt_input)
         ]
-        messages = [{'role': 'user', 'content': content}]
         generation_config = {
-            'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty,
-            'temperature': temperature, 'top_p': top_p, 'top_k': top_k,
             'do_sample': True if temperature > 0 else False
         }
-        response = model.chat(messages, tokenizer, image_processor, generation_config)
         yield response, response
     except Exception as e:
         traceback.print_exc()
         yield f"An error occurred during processing: {str(e)}", ""
     finally:
-        if temp_image_path and os.path.exists(temp_image_path):
-            os.remove(temp_image_path)
-@spaces.GPU
-def extract_text_with_boxes(
-    image: Image.Image,
-    image_scale_factor: float,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    top_k: int,
-    repetition_penalty: float
-):
-    """
-    Processes an image to extract text and bounding boxes, returning the processed text and a visualization.
-    """
-    if image is None:
-        raise gr.Error("Please upload an image first.")
-    original_image = image.copy() # Keep a copy of the original for visualization
-    prompt_for_boxes = "Perform OCR on the image. For each detected line of text, provide its bounding box in the format <box>x_min,y_min,x_max,y_max</box> followed by the text."
-    if image_scale_factor > 1.0:
-        try:
-            original_width, original_height = image.size
-            new_width = int(original_width * image_scale_factor)
-            new_height = int(original_height * image_scale_factor)
-            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-        except Exception as e:
-            print(f"Error during image scaling: {e}")
-    temp_image_path = None
-    try:
-        temp_dir = tempfile.gettempdir()
-        temp_image_path = os.path.join(temp_dir, f"temp_image_{uuid.uuid4()}.png")
-        image.save(temp_image_path)
-        content = [
-            dict(type='image', image=temp_image_path),
-            dict(type='text', text=prompt_for_boxes)
-        ]
-        messages = [{'role': 'user', 'content': content}]
-        generation_config = {
-            'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty,
-            'temperature': temperature, 'top_p': top_p, 'top_k': top_k,
-            'do_sample': True if temperature > 0 else False
-        }
-        response = model.chat(messages, tokenizer, image_processor, generation_config)
-        # Post-process to extract boxes and draw them
-        original_width, original_height = original_image.size
-        # The model's coordinates are normalized to a 1000x1000 canvas
-        scale_width = original_width / 1000.0
-        scale_height = original_height / 1000.0
-        pattern = r"<box>(\d+,\d+,\d+,\d+)</box>\s*(.*?)\s*(?=<box>|$)"
-        matches = re.findall(pattern, response, re.DOTALL)
-        formatted_output = []
-        vis_image = original_image.copy()
-        draw = ImageDraw.Draw(vis_image)
-        for box_str, text in matches:
-            text = text.strip()
-            if not text:
-                continue
-            try:
-                coords = [int(c.strip()) for c in box_str.split(',')]
-                x0, y0, x1, y1 = coords
-                if x0 >= x1 or y0 >= y1:
-                    continue
-                scaled_poly = [
-                    int(x0 * scale_width), int(y0 * scale_height),
-                    int(x1 * scale_width), int(y0 * scale_height),
-                    int(x1 * scale_width), int(y1 * scale_height),
-                    int(x0 * scale_width), int(y1 * scale_height)
-                ]
-                draw.polygon(scaled_poly, outline="red", width=3)
-                formatted_line = f"{','.join(map(str, scaled_poly))},{text}"
-                formatted_output.append(formatted_line)
-            except Exception:
-                continue
-        return "\n".join(formatted_output), vis_image
-    except Exception as e:
-        traceback.print_exc()
-        return f"An error occurred: {str(e)}", None
-    finally:
         if temp_image_path and os.path.exists(temp_image_path):
             os.remove(temp_image_path)
@@ -303,18 +234,28 @@ def create_gradio_interface():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 gr.Textbox(
-                    label="Model in Use ⚡", value="tencent/POINTS-Reader", interactive=False
                 )
                 prompt_input = gr.Textbox(
-                    label="Query Input", placeholder="✦︎ Enter the prompt", value="Perform OCR on the image precisely."
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     image_scale_factor = gr.Slider(
-                        minimum=1.0, maximum=3.0, value=1.0, step=0.1, label="Image Upscale Factor",
                         info="Increases image size before processing. Can improve OCR on small text. Default: 1.0 (no change)."
                     )
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
@@ -334,10 +275,14 @@ def create_gradio_interface():
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
-                        raw_output_stream = gr.Textbox(label="Raw Model Output (max T ≤ 120s)", interactive=False, lines=20, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
-                                examples=["examples/1.jpeg", "examples/2.jpeg", "examples/3.jpeg", "examples/4.jpeg", "examples/5.jpeg"],
                                 inputs=image_input, label="Examples"
                             )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
@@ -346,38 +291,21 @@ def create_gradio_interface():
                         with gr.Accordion("(Result.md)", open=True):
                             markdown_output = gr.Markdown()
-                    # --- NEW TAB FOR BOUNDING BOXES ---
-                    with gr.Tab("🖼️ Bounding Boxes"):
-                        ocr_button = gr.Button("Extract Text with Coordinates", variant="primary")
-                        with gr.Row():
-                            ocr_text = gr.Textbox(
-                                label="Extracted Text with Polygon Coordinates", lines=15, show_copy_button=True, scale=1
-                            )
-                            ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)", scale=2)
-                    # --- END NEW TAB ---
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
                         pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
         # Event Handlers
-        advanced_settings = [image_scale_factor, max_new_tokens, temperature, top_p, top_k, repetition_penalty]
         def clear_all_outputs():
-            return None, "", "Raw output will appear here.", "", None, None, "", None
         process_btn.click(
             fn=process_document_stream,
-            inputs=[image_input, prompt_input] + advanced_settings,
             outputs=[raw_output_stream, markdown_output]
         )
-        ocr_button.click(
-            fn=extract_text_with_boxes,
-            inputs=[image_input] + advanced_settings,
-            outputs=[ocr_text, ocr_vis]
-        )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
@@ -387,7 +315,7 @@ def create_gradio_interface():
         clear_btn.click(
             clear_all_outputs,
-            outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery, ocr_text, ocr_vis]
         )
     return demo

 import gradio as gr
 import requests
 import torch
+from PIL import Image
 import fitz
 import numpy as np
 def process_document_stream(
     image: Image.Image,
     prompt_input: str,
+    image_scale_factor: float, # New parameter for image scaling
     max_new_tokens: int,
     temperature: float,
     top_p: float,
     repetition_penalty: float
 ):
     """
+    Main function that handles model inference using tencent/POINTS-Reader.
     """
     if image is None:
         yield "Please upload an image.", ""
         yield "Please enter a prompt.", ""
         return
+    # --- IMPLEMENTATION: Image Scaling based on user input ---
     if image_scale_factor > 1.0:
         try:
             original_width, original_height = image.size
             new_width = int(original_width * image_scale_factor)
             new_height = int(original_height * image_scale_factor)
+            print(f"Scaling image from {image.size} to ({new_width}, {new_height}) with factor {image_scale_factor}.")
+            # Use a high-quality resampling filter for better results
             image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         except Exception as e:
             print(f"Error during image scaling: {e}")
+            # Continue with the original image if scaling fails
+            pass
+    # --- END IMPLEMENTATION ---
     temp_image_path = None
     try:
+        # --- FIX: Save the PIL Image to a temporary file ---
+        # The model expects a file path, not a PIL object.
         temp_dir = tempfile.gettempdir()
         temp_image_path = os.path.join(temp_dir, f"temp_image_{uuid.uuid4()}.png")
         image.save(temp_image_path)
+        # Prepare content for the model using the temporary file path
         content = [
             dict(type='image', image=temp_image_path),
             dict(type='text', text=prompt_input)
         ]
+        messages = [
+            {
+                'role': 'user',
+                'content': content
+            }
+        ]
+        # Prepare generation configuration from UI inputs
         generation_config = {
+            'max_new_tokens': max_new_tokens,
+            'repetition_penalty': repetition_penalty,
+            'temperature': temperature,
+            'top_p': top_p,
+            'top_k': top_k,
             'do_sample': True if temperature > 0 else False
         }
+        # Run inference
+        response = model.chat(
+            messages,
+            tokenizer,
+            image_processor,
+            generation_config
+        )
+        # Yield the full response at once
         yield response, response
     except Exception as e:
         traceback.print_exc()
         yield f"An error occurred during processing: {str(e)}", ""
     finally:
+        # --- Clean up the temporary image file ---
         if temp_image_path and os.path.exists(temp_image_path):
             os.remove(temp_image_path)
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 gr.Textbox(
+                    label="Model in Use ⚡",
+                    value="tencent/POINTS-Reader",
+                    interactive=False
                 )
                 prompt_input = gr.Textbox(
+                    label="Query Input",
+                    placeholder="✦︎ Enter the prompt",
+                    value="Perform OCR on the image precisely.",
                 )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
+                    # --- NEW UI ELEMENT: Image Scaling Slider ---
                     image_scale_factor = gr.Slider(
+                        minimum=1.0,
+                        maximum=3.0,
+                        value=1.0,
+                        step=0.1,
+                        label="Image Upscale Factor",
                         info="Increases image size before processing. Can improve OCR on small text. Default: 1.0 (no change)."
                     )
+                    # --- END NEW UI ELEMENT ---
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
+                        raw_output_stream = gr.Textbox(label="Raw Model Output (max T ≤ 120s)", interactive=False, lines=15, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
+                                examples=["examples/1.jpeg",
+                                          "examples/2.jpeg",
+                                          "examples/3.jpeg",
+                                          "examples/4.jpeg",
+                                          "examples/5.jpeg"],
                                 inputs=image_input, label="Examples"
                             )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
                         with gr.Accordion("(Result.md)", open=True):
                             markdown_output = gr.Markdown()
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
                         pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
         # Event Handlers
         def clear_all_outputs():
+            return None, "", "Raw output will appear here.", "", None, None
         process_btn.click(
             fn=process_document_stream,
+            # --- UPDATE: Add the new slider to the inputs list ---
+            inputs=[image_input, prompt_input, image_scale_factor, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
             outputs=[raw_output_stream, markdown_output]
         )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
         clear_btn.click(
             clear_all_outputs,
+            outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
         )
     return demo