pdf_gemini

Sleeping

App Files Files Community

Sebbe33 committed on Feb 16, 2025

Commit

62c24e3

verified ·

1 Parent(s): 6018547

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -93

app.py CHANGED Viewed

@@ -9,108 +9,75 @@ from pdf2image import convert_from_bytes
 # Constants
 DETECTION_PROMPT = """\
-Identify all text regions in this document. Provide bounding boxes in the format [xmin, ymin, xmax, ymax]
-as percentages of the image dimensions. Return only a Python-style list of lists without any additional text.
-Example: [[0.1, 0.2, 0.4, 0.5], [0.6, 0.7, 0.8, 0.9]]
 """
-# Helper functions
 def parse_list_boxes(text):
-    """Extracts bounding boxes from response text"""
-    pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
-    matches = re.findall(pattern, text)
-    return [[float(m) for m in match] for match in matches]
 def draw_bounding_boxes(image, boxes):
-    """Draws bounding boxes on the image using [xmin, ymin, xmax, ymax] format"""
     draw = ImageDraw.Draw(image)
     width, height = image.size
     for box in boxes:
-        xmin = max(0.0, min(1.0, box[0]))
-        ymin = max(0.0, min(1.0, box[1]))
-        xmax = max(0.0, min(1.0, box[2]))
-        ymax = max(0.0, min(1.0, box[3]))
-        draw.rectangle([
-            xmin * width,
-            ymin * height,
-            xmax * width,
-            ymax * height
-        ], outline="#00FF00", width=2)
     return image
 # Streamlit UI
-st.title("PDF Text Region Detection with Gemini")
-col1, col2 = st.columns(2)
-with col1:
-    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
-    if uploaded_file:
-        if st.button("Analyze Document"):
-            with st.spinner("Analyzing PDF..."):
-                try:
-                    # Convert PDF to images
-                    pdf_bytes = uploaded_file.read()
-                    images = convert_from_bytes(pdf_bytes)
-                    results = []
-                    # Initialize client
-                    client = genai.Client(api_key=os.getenv("KEY"))
-                    for page_num, image in enumerate(images):
-                        # Prepare image
-                        img_byte_arr = io.BytesIO()
-                        image.save(img_byte_arr, format='PNG')
-                        image_part = types.Part.from_bytes(
-                            data=img_byte_arr.getvalue(),
-                            mime_type="image/png"
-                        )
-                        # Get all text boxes
-                        box_response = client.models.generate_content(
-                            model="gemini-2.0-flash-exp",
-                            contents=[DETECTION_PROMPT, image_part]
-                        )
-                        # Get description
-                        desc_response = client.models.generate_content(
-                            model="gemini-2.0-flash-exp",
-                            contents=["Describe this document section in detail.", image_part]
-                        )
-                        # Process boxes
-                        try:
-                            boxes = parse_list_boxes(box_response.text)
-                        except Exception as e:
-                            st.error(f"Error on page {page_num+1}: {str(e)}")
-                            boxes = []
-                        # Draw boxes
-                        annotated_image = image.copy()
-                        if boxes:
-                            annotated_image = draw_bounding_boxes(annotated_image, boxes)
-                        results.append({
-                            "page": page_num + 1,
-                            "image": annotated_image,
-                            "description": desc_response.text,
-                            "boxes": len(boxes)
-                        })
-                    # Display results
-                    with col2:
-                        st.write(f"## Results ({len(results)} pages)")
-                        tabs = st.tabs([f"Page {res['page']}" for res in results])
-                        for tab, res in zip(tabs, results):
-                            with tab:
-                                st.image(res["image"],
-                                       caption=f"Page {res['page']} - {res['boxes']} text regions detected",
-                                       use_container_width=True)
-                                st.write("**Description:**", res["description"])
-                except Exception as e:
-                    st.error(f"Error: {str(e)}")

 # Constants
 # Prompt sent to the model together with each rendered page image. It asks
 # for a bare list of [xmin, ymin, xmax, ymax] boxes normalized to 0-1, which
 # is the shape parse_list_boxes reads and draw_bounding_boxes scales by the
 # image size.
 DETECTION_PROMPT = """\
+Identify ALL text regions in this document. Return bounding boxes as a Python list of lists
+in format [[xmin, ymin, xmax, ymax]] where coordinates are normalized between 0-1.
+Only return the list, nothing else. Example:
+[[0.05, 0.12, 0.25, 0.18], [0.30, 0.40, 0.50, 0.55]]
 """
 def parse_list_boxes(text):
     """Parse bounding boxes out of the model's reply text.

     The reply is first read as a plain Python literal. If that fails, or the
     literal is not a non-empty sequence of four-number boxes, boxes are
     recovered with a regex scan instead, so replies wrapped in prose or
     Markdown code fences still yield their boxes.

     Args:
         text: Raw reply text. ``None`` or an empty string yields no boxes.

     Returns:
         A list of ``[xmin, ymin, xmax, ymax]`` lists of floats; empty when
         nothing box-shaped is found.
     """
     import ast  # local import so this block does not depend on file-level imports

     def _is_box(item):
         # A box is exactly four real numbers (bool is excluded on purpose).
         return (
             isinstance(item, (list, tuple))
             and len(item) == 4
             and all(
                 isinstance(v, (int, float)) and not isinstance(v, bool)
                 for v in item
             )
         )

     if not text:
         return []

     # The text comes from a remote model, so it must never reach eval():
     # literal_eval accepts only plain literals and cannot execute code.
     try:
         parsed = ast.literal_eval(text.strip())
     except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
         parsed = None

     if isinstance(parsed, (list, tuple)) and parsed and all(_is_box(b) for b in parsed):
         return [[float(v) for v in box] for box in parsed]

     # Fallback: pick out every "[n, n, n, n]" group. The number pattern allows
     # at most one decimal point, so float() below cannot fail on a capture.
     number = r'(\d+(?:\.\d*)?|\.\d+)'
     pattern = r'\[\s*' + r'\s*,\s*'.join([number] * 4) + r'\s*\]'
     return [[float(v) for v in match] for match in re.findall(pattern, text)]
 def draw_bounding_boxes(image, boxes):
     """Outline each normalized box on ``image`` in green and return the image.

     Each box is read as ``[xmin, ymin, xmax, ymax]``. Every coordinate is
     clamped to the 0-1 range and then scaled by the image width (x values)
     or height (y values). A box that cannot be drawn is reported with
     ``st.error`` and skipped; the remaining boxes are still processed.

     Args:
         image: Image to draw on; it must expose ``size`` and be accepted by
             ``ImageDraw.Draw``.
         boxes: Iterable of indexable boxes. A falsy value returns ``image``
             untouched.

     Returns:
         The same ``image`` object that was passed in.
     """
     if not boxes:
         return image

     canvas = ImageDraw.Draw(image)
     img_w, img_h = image.size
     # Axis length that scales xmin, ymin, xmax, ymax, in that order.
     scales = (img_w, img_h, img_w, img_h)
     for box in boxes:
         try:
             corners = [
                 max(0.0, min(1.0, box[i])) * scale
                 for i, scale in enumerate(scales)
             ]
             canvas.rectangle(corners, outline="#00FF00", width=3)
         except Exception as err:
             st.error(f"Error drawing box: {str(err)}")
     return image
 # Streamlit UI
 # Flow: upload a PDF, render every page to an image, ask the model for text
 # bounding boxes, then show the original and the annotated page side by side.
 st.title("PDF Text Detection")
 uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
 if uploaded_file and st.button("Analyze"):
     with st.spinner("Processing..."):
         try:
             # Pages are rasterized at 300 DPI before being sent to the model.
             images = convert_from_bytes(uploaded_file.read(), dpi=300)
             # NOTE(review): the key is read from GOOGLE_API_KEY; confirm the
             # deployment actually sets a variable with this name.
             client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
             for idx, image in enumerate(images):
                 with st.expander(f"Page {idx+1}", expanded=True):
                     img_byte_arr = io.BytesIO()
                     image.save(img_byte_arr, format='PNG')
                     # Part.from_bytes is called with keyword arguments: the
                     # SDK declares data/mime_type as keywords, so positional
                     # values are not a safe way to call it.
                     image_part = types.Part.from_bytes(
                         data=img_byte_arr.getvalue(),
                         mime_type="image/png",
                     )
                     # Get bounding boxes
                     # NOTE(review): model name kept as committed; confirm it is
                     # the intended (and still available) model.
                     response = client.models.generate_content(
                         model="gemini-1.5-pro-latest",
                         contents=[DETECTION_PROMPT, image_part],
                     )
                     # The reply may carry no text; fall back to an empty
                     # string so display and parsing below do not receive None.
                     raw_text = response.text or ""
                     # Debug output. Shown inline rather than in a second
                     # expander, because Streamlit has documented nested
                     # expanders as unsupported and this code already runs
                     # inside the per-page expander.
                     st.caption("Raw API Response")
                     st.code(raw_text)
                     # Parse and draw on a copy so the original stays clean
                     boxes = parse_list_boxes(raw_text)
                     annotated = draw_bounding_boxes(image.copy(), boxes)
                     # Display
                     # NOTE(review): newer Streamlit releases prefer
                     # use_container_width over use_column_width; confirm
                     # against the pinned Streamlit version before switching.
                     cols = st.columns(2)
                     cols[0].image(image, caption="Original", use_column_width=True)
                     cols[1].image(annotated,
                                  caption=f"Detected {len(boxes)} text regions",
                                  use_column_width=True)
         except Exception as e:
             # Top-level UI boundary: surface any failure to the user.
             st.error(f"Error: {str(e)}")