prernajeet01 committed on
Commit
e91d17a
·
verified ·
1 Parent(s): 6de13b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -45
app.py CHANGED
@@ -12,7 +12,7 @@ import json
12
  import re
13
  import time
14
  import numpy as np
15
- import PyMuPDF as fitz # PyMuPDF for PDF image extraction
16
  from dotenv import load_dotenv
17
  from cassandra.cluster import Cluster
18
  from cassandra.auth import PlainTextAuthProvider
@@ -170,55 +170,53 @@ def get_embeddings_model():
170
 
171
  # Extract images from PDFs and store in Astra DB
172
  def extract_images_from_pdf(pdf_content, product_type):
173
- """Extract images from PDF and store them in Astra DB"""
174
  if not astra_session:
175
  return 0
176
 
177
  try:
178
- # Open PDF from bytes
179
- pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
180
- images_stored = 0
181
-
182
- # Extract images from each page
183
- for page_num in range(len(pdf_document)):
184
- page = pdf_document[page_num]
185
- image_list = page.get_images(full=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- for img_index, img_info in enumerate(image_list):
188
- # Extract image
189
- xref = img_info[0]
190
- base_image = pdf_document.extract_image(xref)
191
- image_bytes = base_image["image"]
192
-
193
- # Skip very small images (likely icons or decorative elements)
194
- if len(image_bytes) < 5000: # Skip images smaller than ~5KB
195
- continue
196
-
197
- # Generate a unique ID for the image
198
- image_id = str(uuid.uuid4())
199
-
200
- # Store metadata
201
- metadata = json.dumps({
202
- "product_type": product_type,
203
- "page_number": page_num,
204
- "image_index": img_index,
205
- "timestamp": time.time(),
206
- "image_size": len(image_bytes),
207
- "mime_type": base_image["ext"]
208
- })
209
-
210
- # Insert into Astra DB
211
- astra_session.execute(
212
- f"""
213
- INSERT INTO {astra_keyspace}.product_images
214
- (id, product_type, image_data, page_number, image_index, metadata)
215
- VALUES (%s, %s, %s, %s, %s, %s)
216
- """,
217
- (image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
218
- )
219
- images_stored += 1
220
-
221
- pdf_document.close()
222
  return images_stored
223
  except Exception as e:
224
  print(f"Error extracting images from PDF: {e}")
 
12
  import re
13
  import time
14
  import numpy as np
15
+ import pdfplumber
16
  from dotenv import load_dotenv
17
  from cassandra.cluster import Cluster
18
  from cassandra.auth import PlainTextAuthProvider
 
170
 
171
  # Extract images from PDFs and store in Astra DB
172
  def extract_images_from_pdf(pdf_content, product_type):
173
+ """Extract images from PDF using pdfplumber and store them in Astra DB"""
174
  if not astra_session:
175
  return 0
176
 
177
  try:
178
+ # Create a BytesIO object from the PDF content
179
+ pdf_file = io.BytesIO(pdf_content)
180
+
181
+ # Open the PDF with pdfplumber
182
+ with pdfplumber.open(pdf_file) as pdf:
183
+ images_stored = 0
184
+
185
+ # Iterate through each page
186
+ for page_num, page in enumerate(pdf.pages):
187
+ # Extract images from the page
188
+ for img_index, img in enumerate(page.images):
189
+ # Get image data
190
+ image_bytes = img["stream"].get_data()
191
+
192
+ # Skip small images
193
+ if len(image_bytes) < 5000:
194
+ continue
195
+
196
+ # Generate a unique ID for the image
197
+ image_id = str(uuid.uuid4())
198
+
199
+ # Store metadata
200
+ metadata = json.dumps({
201
+ "product_type": product_type,
202
+ "page_number": page_num,
203
+ "image_index": img_index,
204
+ "timestamp": time.time(),
205
+ "image_size": len(image_bytes),
206
+ "mime_type": "jpg" # Default to jpg for simplicity
207
+ })
208
+
209
+ # Insert into Astra DB
210
+ astra_session.execute(
211
+ f"""
212
+ INSERT INTO {astra_keyspace}.product_images
213
+ (id, product_type, image_data, page_number, image_index, metadata)
214
+ VALUES (%s, %s, %s, %s, %s, %s)
215
+ """,
216
+ (image_id, product_type, bytearray(image_bytes), page_num, img_index, metadata)
217
+ )
218
+ images_stored += 1
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  return images_stored
221
  except Exception as e:
222
  print(f"Error extracting images from PDF: {e}")