Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import re
|
| 3 |
import io
|
| 4 |
import streamlit as st
|
| 5 |
-
from PIL import Image, ImageDraw
|
| 6 |
from google import genai
|
| 7 |
from google.genai import types
|
| 8 |
from pdf2image import convert_from_bytes
|
|
@@ -15,6 +15,8 @@ Only return the list, nothing else. Example:
|
|
| 15 |
[[0.05, 0.12, 0.25, 0.18], [0.30, 0.40, 0.50, 0.55]]
|
| 16 |
"""
|
| 17 |
|
|
|
|
|
|
|
| 18 |
def parse_list_boxes(text):
|
| 19 |
"""Improved parsing with better error handling"""
|
| 20 |
try:
|
|
@@ -24,25 +26,65 @@ def parse_list_boxes(text):
|
|
| 24 |
return [[float(x) for x in m] for m in matches]
|
| 25 |
|
| 26 |
def draw_bounding_boxes(image, boxes):
|
| 27 |
-
"""Enhanced drawing with
|
| 28 |
if not boxes:
|
| 29 |
return image
|
| 30 |
|
| 31 |
draw = ImageDraw.Draw(image)
|
| 32 |
width, height = image.size
|
| 33 |
|
| 34 |
-
for box in boxes:
|
| 35 |
try:
|
|
|
|
| 36 |
xmin = max(0.0, min(1.0, box[0])) * width
|
| 37 |
ymin = max(0.0, min(1.0, box[1])) * height
|
| 38 |
xmax = max(0.0, min(1.0, box[2])) * width
|
| 39 |
ymax = max(0.0, min(1.0, box[3])) * height
|
| 40 |
|
|
|
|
| 41 |
draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
except Exception as e:
|
| 43 |
st.error(f"Error drawing box: {str(e)}")
|
| 44 |
return image
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# Streamlit UI
|
| 47 |
st.title("PDF Text Detection")
|
| 48 |
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
|
|
@@ -53,7 +95,6 @@ if uploaded_file and st.button("Analyze"):
|
|
| 53 |
images = convert_from_bytes(uploaded_file.read(), dpi=300)
|
| 54 |
client = genai.Client(api_key=os.getenv("KEY"))
|
| 55 |
|
| 56 |
-
# Create tabs for pages
|
| 57 |
tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
|
| 58 |
|
| 59 |
for idx, (tab, image) in enumerate(zip(tabs, images)):
|
|
@@ -61,13 +102,12 @@ if uploaded_file and st.button("Analyze"):
|
|
| 61 |
col1, col2 = st.columns(2)
|
| 62 |
|
| 63 |
with col1:
|
| 64 |
-
st.image(image, caption="Original",
|
| 65 |
|
| 66 |
with col2:
|
|
|
|
| 67 |
img_byte_arr = io.BytesIO()
|
| 68 |
image.save(img_byte_arr, format='PNG')
|
| 69 |
-
|
| 70 |
-
# Get bounding boxes
|
| 71 |
response = client.models.generate_content(
|
| 72 |
model="gemini-2.0-flash-exp",
|
| 73 |
contents=[
|
|
@@ -79,14 +119,21 @@ if uploaded_file and st.button("Analyze"):
|
|
| 79 |
]
|
| 80 |
)
|
| 81 |
|
| 82 |
-
# Parse and draw
|
| 83 |
boxes = parse_list_boxes(response.text)
|
| 84 |
-
|
| 85 |
|
|
|
|
|
|
|
| 86 |
st.image(annotated,
|
| 87 |
caption=f"Detected {len(boxes)} text regions",
|
| 88 |
-
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# Debug section
|
| 91 |
debug_expander = st.expander("Debug Details")
|
| 92 |
with debug_expander:
|
|
|
|
| 2 |
import re
|
| 3 |
import io
|
| 4 |
import streamlit as st
|
| 5 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 6 |
from google import genai
|
| 7 |
from google.genai import types
|
| 8 |
from pdf2image import convert_from_bytes
|
|
|
|
| 15 |
[[0.05, 0.12, 0.25, 0.18], [0.30, 0.40, 0.50, 0.55]]
|
| 16 |
"""
|
| 17 |
|
| 18 |
+
# Instruction sent to Gemini when OCR-ing a single cropped region; asks for the
# raw transcription only so response.text can be used verbatim.
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."
|
| 19 |
+
|
| 20 |
def parse_list_boxes(text):
|
| 21 |
"""Improved parsing with better error handling"""
|
| 22 |
try:
|
|
|
|
| 26 |
return [[float(x) for x in m] for m in matches]
|
| 27 |
|
| 28 |
def draw_bounding_boxes(image, boxes):
    """Draw a numbered green rectangle on *image* for every normalized box.

    Each box is [xmin, ymin, xmax, ymax] in [0, 1] page coordinates; values
    are clamped into range before scaling to pixels. The image is modified
    in place and also returned. Per-box failures are reported via st.error
    and do not stop the remaining boxes from being drawn.
    """
    if not boxes:
        return image

    pen = ImageDraw.Draw(image)
    w, h = image.size

    for seq, coords in enumerate(boxes, start=1):
        try:
            # Clamp each normalized coordinate to [0, 1], then scale to pixels.
            left = max(0.0, min(1.0, coords[0])) * w
            top = max(0.0, min(1.0, coords[1])) * h
            right = max(0.0, min(1.0, coords[2])) * w
            bottom = max(0.0, min(1.0, coords[3])) * h

            # Outline the detected region ...
            pen.rectangle([left, top, right, bottom], outline="#00FF00", width=3)

            # ... and tag it with its 1-based sequence number just inside
            # the top-left corner.
            pen.text((left + 5, top + 5), str(seq), fill="red")
        except Exception as e:
            st.error(f"Error drawing box: {str(e)}")
    return image
|
| 53 |
|
| 54 |
+
def extract_text_from_region(client, image, box):
    """Crop *box* out of *image* and ask Gemini to transcribe its text.

    *box* is [xmin, ymin, xmax, ymax] in normalized [0, 1] coordinates.
    Returns the stripped transcription, or "" for degenerate boxes or on
    any failure (errors are surfaced through st.error rather than raised).
    """
    try:
        w, h = image.size

        # Normalized corners -> clamped integer pixel corners.
        left = int(max(0.0, min(1.0, box[0])) * w)
        upper = int(max(0.0, min(1.0, box[1])) * h)
        right = int(max(0.0, min(1.0, box[2])) * w)
        lower = int(max(0.0, min(1.0, box[3])) * h)

        # An empty or inverted box has nothing to OCR.
        if left >= right or upper >= lower:
            return ""

        # Serialize just the cropped region as PNG for the API payload.
        buffer = io.BytesIO()
        image.crop((left, upper, right, lower)).save(buffer, format='PNG')

        # Ask Gemini for the transcription of this single crop.
        reply = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=[
                TEXT_EXTRACTION_PROMPT,
                types.Part.from_bytes(
                    data=buffer.getvalue(),
                    mime_type="image/png"
                )
            ]
        )
        return reply.text.strip()
    except Exception as e:
        st.error(f"Text extraction error: {str(e)}")
        return ""
|
| 87 |
+
|
| 88 |
# Streamlit UI
|
| 89 |
st.title("PDF Text Detection")
|
| 90 |
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
|
|
|
|
| 95 |
images = convert_from_bytes(uploaded_file.read(), dpi=300)
|
| 96 |
client = genai.Client(api_key=os.getenv("KEY"))
|
| 97 |
|
|
|
|
| 98 |
tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
|
| 99 |
|
| 100 |
for idx, (tab, image) in enumerate(zip(tabs, images)):
|
|
|
|
| 102 |
col1, col2 = st.columns(2)
|
| 103 |
|
| 104 |
with col1:
|
| 105 |
+
st.image(image, caption="Original", use_column_width=True)
|
| 106 |
|
| 107 |
with col2:
|
| 108 |
+
# Get bounding boxes
|
| 109 |
img_byte_arr = io.BytesIO()
|
| 110 |
image.save(img_byte_arr, format='PNG')
|
|
|
|
|
|
|
| 111 |
response = client.models.generate_content(
|
| 112 |
model="gemini-2.0-flash-exp",
|
| 113 |
contents=[
|
|
|
|
| 119 |
]
|
| 120 |
)
|
| 121 |
|
|
|
|
| 122 |
boxes = parse_list_boxes(response.text)
|
| 123 |
+
texts = [extract_text_from_region(client, image, box) for box in boxes]
|
| 124 |
|
| 125 |
+
# Draw annotated image
|
| 126 |
+
annotated = draw_bounding_boxes(image.copy(), boxes)
|
| 127 |
st.image(annotated,
|
| 128 |
caption=f"Detected {len(boxes)} text regions",
|
| 129 |
+
use_column_width=True)
|
| 130 |
|
| 131 |
+
# Display extracted texts
|
| 132 |
+
if any(texts):
|
| 133 |
+
st.subheader("Extracted Texts:")
|
| 134 |
+
for i, text in enumerate(texts, 1):
|
| 135 |
+
st.write(f"{i}. {text if text else 'No text detected'}")
|
| 136 |
+
|
| 137 |
# Debug section
|
| 138 |
debug_expander = st.expander("Debug Details")
|
| 139 |
with debug_expander:
|