pdf_gemini

Sleeping

App Files Community

Sebbe33 committed on Feb 13, 2025

Commit

7e4f227

verified ·

1 Parent(s): 355b6ef

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -78

app.py CHANGED Viewed

@@ -5,123 +5,112 @@ import streamlit as st
 from PIL import Image, ImageDraw
 from google import genai
 from google.genai import types
-# Hilfsfunktionen
 def parse_list_boxes(text):
-    """Extrahiert Bounding Boxes aus dem Antworttext"""
     pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
     matches = re.findall(pattern, text)
     return [[float(m) for m in match] for match in matches]
 def draw_bounding_boxes(image, boxes):
-    """Zeichnet Bounding Boxes auf das Bild"""
     draw = ImageDraw.Draw(image)
     width, height = image.size
     for box in boxes:
-        # Sicherstellen, dass alle Werte zwischen 0-1 liegen
         ymin = max(0.0, min(1.0, box[0]))
         xmin = max(0.0, min(1.0, box[1]))
         ymax = max(0.0, min(1.0, box[2]))
         xmax = max(0.0, min(1.0, box[3]))
-        # Zeichne den Rahmen
         draw.rectangle([
             xmin * width,
             ymin * height,
             xmax * width,
             ymax * height
-        ], outline="#00FF00", width=7)  # Neon green mit dicken Linien
     return image
 # Streamlit UI
-st.title("Objekterkennung mit Gemini")
 col1, col2 = st.columns(2)
 with col1:
-    uploaded_file = st.file_uploader("Bild hochladen", type=["jpg", "png", "jpeg"])
-    object_name = st.text_input("Objekt zur Erkennung", placeholder="z.B. 'Auto', 'Person'")
-    if uploaded_file and object_name:
-        image = Image.open(uploaded_file)
-        width, height = image.size
-        st.image(image, caption="Hochgeladenes Bild", use_container_width=True)
         if st.button("Analysieren"):
-            with st.spinner("Analysiere Bild..."):
                 try:
-                    # Bildvorbereitung
-                    image_bytes = io.BytesIO()
-                    image.save(image_bytes, format=image.format)
-                    image_part = types.Part.from_bytes(
-                        data=image_bytes.getvalue(),
-                        mime_type=f"image/{image.format.lower()}"
-                    )
-                    # API-Client
                     client = genai.Client(api_key=os.getenv("KEY"))
-                    # Bildbeschreibung
-                    desc_response = client.models.generate_content(
-                        model="gemini-2.0-flash-exp",
-                        contents=["Beschreibe dieses Bild detailliert.", image_part]
-                    )
-                    # Objekterkennung
-                    detection_prompt = (
-                        f"Gib exakt 4 Dezimalzahlen pro Box für alle {object_name} im Format "
-                        "[ymin, xmin, ymax, xmax] als reine Python-Liste ohne weiteren Text. "
-                        "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
-                    )
-                    box_response = client.models.generate_content(
-                        model="gemini-2.0-flash-exp",
-                        contents=[detection_prompt, image_part]
-                    )
-                    # Verarbeitung
-                    try:
-                        boxes = parse_list_boxes(box_response.text)
-                        st.write("**Parsed Boxes:**", boxes)
-                    except Exception as e:
-                        st.error(f"Parsing Error: {str(e)}")
-                        boxes = []
-                    annotated_image = image.copy()
-                    if boxes:
-                        annotated_image = draw_bounding_boxes(annotated_image, boxes)
-                        result_text = f"{len(boxes)} {object_name} erkannt"
-                        # Zoom auf erste Box
-                        ymin, xmin, ymax, xmax = boxes[0]
-                        zoom_area = (
-                            max(0, int(xmin * width - 50)),
-                            max(0, int(ymin * height - 50)),
-                            min(width, int(xmax * width + 50)),
-                            min(height, int(ymax * height + 50))
                         )
-                        zoomed_image = annotated_image.crop(zoom_area)
-                    else:
-                        result_text = "Keine Objekte gefunden"
-                        zoomed_image = None
-                    # Ergebnisse anzeigen
-                    with col2:
-                        st.write("## Objekterkennung:")
-                        st.write(result_text)
                         if boxes:
-                            st.image(
-                                [annotated_image, zoomed_image],
-                                caption=["Gesamtbild", "Zoom auf Erkennung"],
-                                width=400
-                            )
-                        else:
-                            st.image(annotated_image, caption="Keine Objekte erkannt", width=400)
-                        st.write("## Beschreibung:")
-                        st.write(desc_response.text)
                 except Exception as e:
                     st.error(f"Fehler: {str(e)}")

 from PIL import Image, ImageDraw
 from google import genai
 from google.genai import types
+from pdf2image import convert_from_bytes
+# Helper functions
 def parse_list_boxes(text):
+    """Extracts bounding boxes from response text"""
     pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
     matches = re.findall(pattern, text)
     return [[float(m) for m in match] for match in matches]
 def draw_bounding_boxes(image, boxes):
+    """Draws bounding boxes on the image"""
     draw = ImageDraw.Draw(image)
     width, height = image.size
     for box in boxes:
         ymin = max(0.0, min(1.0, box[0]))
         xmin = max(0.0, min(1.0, box[1]))
         ymax = max(0.0, min(1.0, box[2]))
         xmax = max(0.0, min(1.0, box[3]))
         draw.rectangle([
             xmin * width,
             ymin * height,
             xmax * width,
             ymax * height
+        ], outline="#00FF00", width=3)
     return image
 # Streamlit UI
+st.title("PDF Themenerkennung mit Gemini")
 col1, col2 = st.columns(2)
 with col1:
+    uploaded_file = st.file_uploader("PDF hochladen", type=["pdf"])
+    topic_name = st.text_input("Thema zur Erkennung", placeholder="z.B. 'Überschrift', 'Tabelle', 'Absatz'")
+    if uploaded_file and topic_name:
         if st.button("Analysieren"):
+            with st.spinner("Analysiere PDF..."):
                 try:
+                    # Convert PDF to images
+                    pdf_bytes = uploaded_file.read()
+                    images = convert_from_bytes(pdf_bytes)
+                    results = []
+                    # Initialize client
                     client = genai.Client(api_key=os.getenv("KEY"))
+                    for page_num, image in enumerate(images):
+                        # Prepare image
+                        img_byte_arr = io.BytesIO()
+                        image.save(img_byte_arr, format='PNG')
+                        image_part = types.Part.from_bytes(
+                            data=img_byte_arr.getvalue(),
+                            mime_type="image/png"
+                        )
+                        # Get topic boxes
+                        detection_prompt = (
+                            f"Identifiziere alle {topic_name} Bereiche in diesem Dokument. "
+                            "Gib Bounding Boxes im Format [ymin, xmin, ymax, xmax] "
+                            "als reine Python-Liste ohne weiteren Text. "
+                            "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
+                        )
+                        box_response = client.models.generate_content(
+                            model="gemini-2.0-flash-exp",
+                            contents=[detection_prompt, image_part]
+                        )
+                        # Get description
+                        desc_response = client.models.generate_content(
+                            model="gemini-2.0-flash-exp",
+                            contents=["Beschreibe diesen Dokumentenausschnitt detailliert.", image_part]
                         )
+                        # Process boxes
+                        try:
+                            boxes = parse_list_boxes(box_response.text)
+                        except Exception as e:
+                            st.error(f"Fehler bei Seite {page_num+1}: {str(e)}")
+                            boxes = []
+                        # Draw boxes
+                        annotated_image = image.copy()
                         if boxes:
+                            annotated_image = draw_bounding_boxes(annotated_image, boxes)
+                        results.append({
+                            "page": page_num + 1,
+                            "image": annotated_image,
+                            "description": desc_response.text,
+                            "boxes": len(boxes)
+                        })
+                    # Display results
+                    with col2:
+                        st.write(f"## Ergebnisse ({len(results)} Seiten)")
+                        tabs = st.tabs([f"Seite {res['page']}" for res in results])
+                        for tab, res in zip(tabs, results):
+                            with tab:
+                                st.image(res["image"],
+                                       caption=f"Seite {res['page']} - {res['boxes']} {topic_name} erkannt",
+                                       use_container_width=True)
+                                st.write("**Beschreibung:**", res["description"])
                 except Exception as e:
                     st.error(f"Fehler: {str(e)}")