Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ import json
|
|
| 12 |
import re
|
| 13 |
import time
|
| 14 |
import numpy as np
|
| 15 |
-
import
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
from cassandra.cluster import Cluster
|
| 18 |
from cassandra.auth import PlainTextAuthProvider
|
|
@@ -170,55 +170,53 @@ def get_embeddings_model():
|
|
| 170 |
|
| 171 |
# Extract images from PDFs and store in Astra DB
|
| 172 |
def extract_images_from_pdf(pdf_content, product_type):
|
| 173 |
-
"""Extract images from PDF and store them in Astra DB"""
|
| 174 |
if not astra_session:
|
| 175 |
return 0
|
| 176 |
|
| 177 |
try:
|
| 178 |
-
#
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
for img_index, img_info in enumerate(image_list):
|
| 188 |
-
# Extract image
|
| 189 |
-
xref = img_info[0]
|
| 190 |
-
base_image = pdf_document.extract_image(xref)
|
| 191 |
-
image_bytes = base_image["image"]
|
| 192 |
-
|
| 193 |
-
# Skip very small images (likely icons or decorative elements)
|
| 194 |
-
if len(image_bytes) < 5000: # Skip images smaller than ~5KB
|
| 195 |
-
continue
|
| 196 |
-
|
| 197 |
-
# Generate a unique ID for the image
|
| 198 |
-
image_id = str(uuid.uuid4())
|
| 199 |
-
|
| 200 |
-
# Store metadata
|
| 201 |
-
metadata = json.dumps({
|
| 202 |
-
"product_type": product_type,
|
| 203 |
-
"page_number": page_num,
|
| 204 |
-
"image_index": img_index,
|
| 205 |
-
"timestamp": time.time(),
|
| 206 |
-
"image_size": len(image_bytes),
|
| 207 |
-
"mime_type": base_image["ext"]
|
| 208 |
-
})
|
| 209 |
-
|
| 210 |
-
# Insert into Astra DB
|
| 211 |
-
astra_session.execute(
|
| 212 |
-
f"""
|
| 213 |
-
INSERT INTO {astra_keyspace}.product_images
|
| 214 |
-
(id, product_type, image_data, page_number, image_index, metadata)
|
| 215 |
-
VALUES (%s, %s, %s, %s, %s, %s)
|
| 216 |
-
""",
|
| 217 |
-
(image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
|
| 218 |
-
)
|
| 219 |
-
images_stored += 1
|
| 220 |
-
|
| 221 |
-
pdf_document.close()
|
| 222 |
return images_stored
|
| 223 |
except Exception as e:
|
| 224 |
print(f"Error extracting images from PDF: {e}")
|
|
|
|
| 12 |
import re
|
| 13 |
import time
|
| 14 |
import numpy as np
|
| 15 |
+
import pdfplumber
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
from cassandra.cluster import Cluster
|
| 18 |
from cassandra.auth import PlainTextAuthProvider
|
|
|
|
| 170 |
|
| 171 |
# Extract images from PDFs and store in Astra DB
|
| 172 |
def extract_images_from_pdf(pdf_content, product_type):
|
| 173 |
+
"""Extract images from PDF using pdfplumber and store them in Astra DB"""
|
| 174 |
if not astra_session:
|
| 175 |
return 0
|
| 176 |
|
| 177 |
try:
|
| 178 |
+
# Create a BytesIO object from the PDF content
|
| 179 |
+
pdf_file = io.BytesIO(pdf_content)
|
| 180 |
+
|
| 181 |
+
# Open the PDF with pdfplumber
|
| 182 |
+
with pdfplumber.open(pdf_file) as pdf:
|
| 183 |
+
images_stored = 0
|
| 184 |
+
|
| 185 |
+
# Iterate through each page
|
| 186 |
+
for page_num, page in enumerate(pdf.pages):
|
| 187 |
+
# Extract images from the page
|
| 188 |
+
for img_index, img in enumerate(page.images):
|
| 189 |
+
# Get image data
|
| 190 |
+
image_bytes = img["stream"].get_data()
|
| 191 |
+
|
| 192 |
+
# Skip small images
|
| 193 |
+
if len(image_bytes) < 5000:
|
| 194 |
+
continue
|
| 195 |
+
|
| 196 |
+
# Generate a unique ID for the image
|
| 197 |
+
image_id = str(uuid.uuid4())
|
| 198 |
+
|
| 199 |
+
# Store metadata
|
| 200 |
+
metadata = json.dumps({
|
| 201 |
+
"product_type": product_type,
|
| 202 |
+
"page_number": page_num,
|
| 203 |
+
"image_index": img_index,
|
| 204 |
+
"timestamp": time.time(),
|
| 205 |
+
"image_size": len(image_bytes),
|
| 206 |
+
"mime_type": "jpg" # Default to jpg for simplicity
|
| 207 |
+
})
|
| 208 |
+
|
| 209 |
+
# Insert into Astra DB
|
| 210 |
+
astra_session.execute(
|
| 211 |
+
f"""
|
| 212 |
+
INSERT INTO {astra_keyspace}.product_images
|
| 213 |
+
(id, product_type, image_data, page_number, image_index, metadata)
|
| 214 |
+
VALUES (%s, %s, %s, %s, %s, %s)
|
| 215 |
+
""",
|
| 216 |
+
(image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
|
| 217 |
+
)
|
| 218 |
+
images_stored += 1
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
return images_stored
|
| 221 |
except Exception as e:
|
| 222 |
print(f"Error extracting images from PDF: {e}")
|