Spaces:

prithivMLmods
/

POINTS-Reader-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Sep 11, 2025

Commit

b4d0ff9

verified ·

1 Parent(s): 1b38818

update app

Browse files

Files changed (1) hide show

app.py +99 -62

app.py CHANGED Viewed

@@ -138,7 +138,7 @@ def process_document_stream(
     repetition_penalty: float
 ):
     """
-    Main function that handles model inference for general OCR.
     """
     if image is None:
         yield "Please upload an image.", ""
@@ -152,11 +152,9 @@ def process_document_stream(
             original_width, original_height = image.size
             new_width = int(original_width * image_scale_factor)
             new_height = int(original_height * image_scale_factor)
-            print(f"Scaling image from {image.size} to ({new_width}, {new_height}) with factor {image_scale_factor}.")
             image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         except Exception as e:
             print(f"Error during image scaling: {e}")
-            pass
     temp_image_path = None
     try:
@@ -171,11 +169,8 @@ def process_document_stream(
         messages = [{'role': 'user', 'content': content}]
         generation_config = {
-            'max_new_tokens': max_new_tokens,
-            'repetition_penalty': repetition_penalty,
-            'temperature': temperature,
-            'top_p': top_p,
-            'top_k': top_k,
             'do_sample': True if temperature > 0 else False
         }
@@ -189,63 +184,93 @@ def process_document_stream(
         if temp_image_path and os.path.exists(temp_image_path):
             os.remove(temp_image_path)
-# --- Bounding Box Extraction Logic ---
 @spaces.GPU
-def extract_text_with_coordinates(image: Image.Image):
     """
-    Runs the model with a specific prompt to get OCR and bounding boxes,
-    then processes the output to create a visualization.
     """
     if image is None:
-        raise gr.Error("Please upload an image first in the main tab.")
-    prompt = "Please perform OCR on the image and provide the bounding box for each recognized text line. The format should be 'text<box>x1, y1, x2, y2</box>'."
     temp_image_path = None
     try:
         temp_dir = tempfile.gettempdir()
         temp_image_path = os.path.join(temp_dir, f"temp_image_{uuid.uuid4()}.png")
         image.save(temp_image_path)
-        content = [dict(type='image', image=temp_image_path), dict(type='text', text=prompt)]
         messages = [{'role': 'user', 'content': content}]
-        generation_config = {'max_new_tokens': 4096}
         response = model.chat(messages, tokenizer, image_processor, generation_config)
-        original_width, original_height = image.size
-        # Regex to find coordinates inside <box> tags
-        pattern_coords = r"<box>(\d+,\s*\d+,\s*\d+,\s*\d+)</box>"
-        # Regex to split the string by the full box tag to isolate text
-        pattern_splitter = r"<box>\d+,\s*\d+,\s*\d+,\s*\d+</box>"
-        bboxs_raw = re.findall(pattern_coords, response)
-        lines = [line.strip() for line in re.split(pattern_splitter, response) if line.strip()]
-        num_items = min(len(lines), len(bboxs_raw))
-        vis_image = image.copy()
         draw = ImageDraw.Draw(vis_image)
-        output_text = ""
-        for i in range(num_items):
-            line_text = lines[i]
-            box_coords = [int(c.strip()) for c in bboxs_raw[i].split(',')]
-            if len(box_coords) == 4:
-                x0, y0, x1, y1 = box_coords
-                # Scale coordinates from the model's 1000px basis to the original image size
-                x0_s = int(x0 * original_width / 1000)
-                y0_s = int(y0 * original_height / 1000)
-                x1_s = int(x1 * original_width / 1000)
-                y1_s = int(y1 * original_height / 1000)
-                draw.rectangle([x0_s, y0_s, x1_s, y1_s], outline="red", width=2)
-                # Format output as a polygon (quadrilateral) and the extracted text
-                output_text += f"{x0_s},{y0_s},{x1_s},{y0_s},{x1_s},{y1_s},{x0_s},{y1_s},{line_text}\n"
-        return output_text.strip(), vis_image
     except Exception as e:
         traceback.print_exc()
@@ -277,12 +302,19 @@ def create_gradio_interface():
         with gr.Row():
             # Left Column (Inputs)
             with gr.Column(scale=1):
-                gr.Textbox(label="Model in Use ⚡", value="tencent/POINTS-Reader", interactive=False)
-                prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter the prompt", value="Perform OCR on the image precisely.")
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
-                    image_scale_factor = gr.Slider(minimum=1.0, maximum=3.0, value=1.0, step=0.1, label="Image Upscale Factor", info="Increases image size before processing. Can improve OCR on small text.")
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
@@ -302,23 +334,27 @@ def create_gradio_interface():
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
-                        raw_output_stream = gr.Textbox(label="Raw Model Output (max T ≤ 120s)", interactive=False, lines=15, show_copy_button=True)
                         with gr.Row():
-                            examples = gr.Examples(examples=["examples/1.jpeg", "examples/2.jpeg", "examples/3.jpeg", "examples/4.jpeg", "examples/5.jpeg"], inputs=image_input, label="Examples")
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
                     with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Result.md)", open=True):
                             markdown_output = gr.Markdown()
-                    with gr.Tab("Bounding Boxes"):
-                        gr.Markdown("Click the button to extract text and visualize its location on the image. This uses a specialized prompt to get coordinates from the model.")
                         with gr.Row():
-                            with gr.Column(scale=1):
-                                ocr_button = gr.Button("🔍 Extract Text with Coordinates", variant="primary")
-                                ocr_text = gr.Textbox(label="Extracted Text with Coordinates", info="Format: x1,y1,x2,y2,x3,y3,x4,y4,text", lines=15, show_copy_button=True)
-                            with gr.Column(scale=1):
-                                ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
@@ -326,22 +362,23 @@ def create_gradio_interface():
                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
         # Event Handlers
         def clear_all_outputs():
-            # Clear all input and output fields across all tabs
             return None, "", "Raw output will appear here.", "", None, None, "", None
         process_btn.click(
             fn=process_document_stream,
-            inputs=[image_input, prompt_input, image_scale_factor, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
             outputs=[raw_output_stream, markdown_output]
         )
         ocr_button.click(
-            fn=extract_text_with_coordinates,
-            inputs=[image_input],
             outputs=[ocr_text, ocr_vis]
         )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
             inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],

     repetition_penalty: float
 ):
     """
+    Main function for standard OCR, handles model inference using tencent/POINTS-Reader.
     """
     if image is None:
         yield "Please upload an image.", ""
             original_width, original_height = image.size
             new_width = int(original_width * image_scale_factor)
             new_height = int(original_height * image_scale_factor)
             image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         except Exception as e:
             print(f"Error during image scaling: {e}")
     temp_image_path = None
     try:
         messages = [{'role': 'user', 'content': content}]
         generation_config = {
+            'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty,
+            'temperature': temperature, 'top_p': top_p, 'top_k': top_k,
             'do_sample': True if temperature > 0 else False
         }
         if temp_image_path and os.path.exists(temp_image_path):
             os.remove(temp_image_path)
 @spaces.GPU
+def extract_text_with_boxes(
+    image: Image.Image,
+    image_scale_factor: float,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float
+):
     """
+    Processes an image to extract text and bounding boxes, returning the processed text and a visualization.
     """
     if image is None:
+        raise gr.Error("Please upload an image first.")
+    original_image = image.copy() # Keep a copy of the original for visualization
+    prompt_for_boxes = "Perform OCR on the image. For each detected line of text, provide its bounding box in the format <box>x_min,y_min,x_max,y_max</box> followed by the text."
+    if image_scale_factor > 1.0:
+        try:
+            original_width, original_height = image.size
+            new_width = int(original_width * image_scale_factor)
+            new_height = int(original_height * image_scale_factor)
+            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+        except Exception as e:
+            print(f"Error during image scaling: {e}")
     temp_image_path = None
     try:
         temp_dir = tempfile.gettempdir()
         temp_image_path = os.path.join(temp_dir, f"temp_image_{uuid.uuid4()}.png")
         image.save(temp_image_path)
+        content = [
+            dict(type='image', image=temp_image_path),
+            dict(type='text', text=prompt_for_boxes)
+        ]
         messages = [{'role': 'user', 'content': content}]
+        generation_config = {
+            'max_new_tokens': max_new_tokens, 'repetition_penalty': repetition_penalty,
+            'temperature': temperature, 'top_p': top_p, 'top_k': top_k,
+            'do_sample': True if temperature > 0 else False
+        }
         response = model.chat(messages, tokenizer, image_processor, generation_config)
+        # Post-process to extract boxes and draw them
+        original_width, original_height = original_image.size
+        # The model's coordinates are normalized to a 1000x1000 canvas
+        scale_width = original_width / 1000.0
+        scale_height = original_height / 1000.0
+        pattern = r"<box>(\d+,\d+,\d+,\d+)</box>\s*(.*?)\s*(?=<box>|$)"
+        matches = re.findall(pattern, response, re.DOTALL)
+        formatted_output = []
+        vis_image = original_image.copy()
         draw = ImageDraw.Draw(vis_image)
+        for box_str, text in matches:
+            text = text.strip()
+            if not text:
+                continue
+            try:
+                coords = [int(c.strip()) for c in box_str.split(',')]
+                x0, y0, x1, y1 = coords
+                if x0 >= x1 or y0 >= y1:
+                    continue
+                scaled_poly = [
+                    int(x0 * scale_width), int(y0 * scale_height),
+                    int(x1 * scale_width), int(y0 * scale_height),
+                    int(x1 * scale_width), int(y1 * scale_height),
+                    int(x0 * scale_width), int(y1 * scale_height)
+                ]
+                draw.polygon(scaled_poly, outline="red", width=3)
+                formatted_line = f"{','.join(map(str, scaled_poly))},{text}"
+                formatted_output.append(formatted_line)
+            except Exception:
+                continue
+        return "\n".join(formatted_output), vis_image
     except Exception as e:
         traceback.print_exc()
         with gr.Row():
             # Left Column (Inputs)
             with gr.Column(scale=1):
+                gr.Textbox(
+                    label="Model in Use ⚡", value="tencent/POINTS-Reader", interactive=False
+                )
+                prompt_input = gr.Textbox(
+                    label="Query Input", placeholder="✦︎ Enter the prompt", value="Perform OCR on the image precisely."
+                )
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
+                    image_scale_factor = gr.Slider(
+                        minimum=1.0, maximum=3.0, value=1.0, step=0.1, label="Image Upscale Factor",
+                        info="Increases image size before processing. Can improve OCR on small text. Default: 1.0 (no change)."
+                    )
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
             with gr.Column(scale=2):
                 with gr.Tabs() as tabs:
                     with gr.Tab("📝 Extracted Content"):
+                        raw_output_stream = gr.Textbox(label="Raw Model Output (max T ≤ 120s)", interactive=False, lines=20, show_copy_button=True)
                         with gr.Row():
+                            examples = gr.Examples(
+                                examples=["examples/1.jpeg", "examples/2.jpeg", "examples/3.jpeg", "examples/4.jpeg", "examples/5.jpeg"],
+                                inputs=image_input, label="Examples"
+                            )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
                     with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Result.md)", open=True):
                             markdown_output = gr.Markdown()
+                    # --- NEW TAB FOR BOUNDING BOXES ---
+                    with gr.Tab("🖼️ Bounding Boxes"):
+                        ocr_button = gr.Button("Extract Text with Coordinates", variant="primary")
                         with gr.Row():
+                            ocr_text = gr.Textbox(
+                                label="Extracted Text with Polygon Coordinates", lines=15, show_copy_button=True, scale=1
+                            )
+                            ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)", scale=2)
+                    # --- END NEW TAB ---
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
         # Event Handlers
+        advanced_settings = [image_scale_factor, max_new_tokens, temperature, top_p, top_k, repetition_penalty]
         def clear_all_outputs():
             # Returns a fixed 8-element tuple of reset values (None, empty strings,
             # and the "Raw output will appear here." placeholder text).
             # NOTE(review): presumably bound to a clear/reset event whose `outputs`
             # list has eight components in this same order; that binding is not
             # visible in this hunk, so confirm the count and order against it.
             return None, "", "Raw output will appear here.", "", None, None, "", None
         process_btn.click(
             fn=process_document_stream,
+            inputs=[image_input, prompt_input] + advanced_settings,
             outputs=[raw_output_stream, markdown_output]
         )
         ocr_button.click(
+            fn=extract_text_with_boxes,
+            inputs=[image_input] + advanced_settings,
             outputs=[ocr_text, ocr_vis]
         )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
             inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],