Spaces:

prernajeet01
/

ABB

Sleeping

App Files Files Community

prernajeet01 committed on Mar 10, 2025

Commit

e91d17a

verified ·

1 Parent(s): 6de13b0

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -45

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import json
 import re
 import time
 import numpy as np
-import PyMuPDF as fitz  # PyMuPDF for PDF image extraction
 from dotenv import load_dotenv
 from cassandra.cluster import Cluster
 from cassandra.auth import PlainTextAuthProvider
@@ -170,55 +170,53 @@ def get_embeddings_model():
 # Extract images from PDFs and store in Astra DB
 def extract_images_from_pdf(pdf_content, product_type):
-    """Extract images from PDF and store them in Astra DB"""
     if not astra_session:
         return 0
     try:
-        # Open PDF from bytes
-        pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
-        images_stored = 0
-        # Extract images from each page
-        for page_num in range(len(pdf_document)):
-            page = pdf_document[page_num]
-            image_list = page.get_images(full=True)
-            for img_index, img_info in enumerate(image_list):
-                # Extract image
-                xref = img_info[0]
-                base_image = pdf_document.extract_image(xref)
-                image_bytes = base_image["image"]
-                # Skip very small images (likely icons or decorative elements)
-                if len(image_bytes) < 5000:  # Skip images smaller than ~5KB
-                    continue
-                # Generate a unique ID for the image
-                image_id = str(uuid.uuid4())
-                # Store metadata
-                metadata = json.dumps({
-                    "product_type": product_type,
-                    "page_number": page_num,
-                    "image_index": img_index,
-                    "timestamp": time.time(),
-                    "image_size": len(image_bytes),
-                    "mime_type": base_image["ext"]
-                })
-                # Insert into Astra DB
-                astra_session.execute(
-                    f"""
-                    INSERT INTO {astra_keyspace}.product_images
-                    (id, product_type, image_data, page_number, image_index, metadata)
-                    VALUES (%s, %s, %s, %s, %s, %s)
-                    """,
-                    (image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
-                )
-                images_stored += 1
-        pdf_document.close()
         return images_stored
     except Exception as e:
         print(f"Error extracting images from PDF: {e}")

 import re
 import time
 import numpy as np
+import pdfplumber
 from dotenv import load_dotenv
 from cassandra.cluster import Cluster
 from cassandra.auth import PlainTextAuthProvider
 # Extract images from PDFs and store in Astra DB
 def extract_images_from_pdf(pdf_content, product_type):
+    """Extract images from PDF using pdfplumber and store them in Astra DB"""
     if not astra_session:
         return 0
     try:
+        # Create a BytesIO object from the PDF content
+        pdf_file = io.BytesIO(pdf_content)
+        # Open the PDF with pdfplumber
+        with pdfplumber.open(pdf_file) as pdf:
+            images_stored = 0
+            # Iterate through each page
+            for page_num, page in enumerate(pdf.pages):
+                # Extract images from the page
+                for img_index, img in enumerate(page.images):
+                    # Get image data
+                    image_bytes = img["stream"].get_data()
+                    # Skip small images
+                    if len(image_bytes) < 5000:
+                        continue
+                    # Generate a unique ID for the image
+                    image_id = str(uuid.uuid4())
+                    # Store metadata
+                    metadata = json.dumps({
+                        "product_type": product_type,
+                        "page_number": page_num,
+                        "image_index": img_index,
+                        "timestamp": time.time(),
+                        "image_size": len(image_bytes),
+                        "mime_type": "jpg"  # Default to jpg for simplicity
+                    })
+                    # Insert into Astra DB
+                    astra_session.execute(
+                        f"""
+                        INSERT INTO {astra_keyspace}.product_images
+                        (id, product_type, image_data, page_number, image_index, metadata)
+                        VALUES (%s, %s, %s, %s, %s, %s)
+                        """,
+                        (image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
+                    )
+                    images_stored += 1
         return images_stored
     except Exception as e:
         print(f"Error extracting images from PDF: {e}")