vkumartr committed on
Commit
a4b95eb
·
verified ·
1 Parent(s): 7c0c12d

Stores base64 before converting PDF to image

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -106,6 +106,10 @@ def extract_invoice_data(file_data, content_type, json_schema):
106
  if content_type == "application/pdf":
107
  try:
108
  extracted_text = extract_pdf_text(file_data)
 
 
 
 
109
  images = convert_from_bytes(file_data) # Convert PDF to images
110
 
111
  if len(images) > 2:
@@ -118,7 +122,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
118
  base64_images.append(f"data:image/png;base64,{base64_encoded}")
119
 
120
  # Store all images as a single JSON object
121
- base64DataResp = json.dumps(base64_images)
122
 
123
  except Exception as e:
124
  logger.error(f"Error converting PDF to image: {e}")
@@ -127,8 +131,9 @@ def extract_invoice_data(file_data, content_type, json_schema):
127
  else:
128
  # Handle direct image files
129
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
130
- base64_images.append(f"data:{content_type};base64,{base64_encoded}")
131
- base64DataResp = json.dumps(base64_images) # Store as a JSON object
 
132
 
133
  # Prepare OpenAI request
134
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
@@ -212,7 +217,6 @@ def extract_text_from_file(
212
  "entityrefkey": entity_ref_key,
213
  "extracted_data": extracted_data
214
  }
215
-
216
  inserted_doc = invoice_collection.insert_one(document)
217
  document_id = str(inserted_doc.inserted_id)
218
  logger.info(f"Document inserted with ID: {document_id}")
 
106
  if content_type == "application/pdf":
107
  try:
108
  extracted_text = extract_pdf_text(file_data)
109
+ # Store PDF as Base64
110
+ base64_pdf = base64.b64encode(file_data).decode('utf-8')
111
+ base64DataResp.append(f"data:application/pdf;base64,{base64_pdf}")
112
+
113
  images = convert_from_bytes(file_data) # Convert PDF to images
114
 
115
  if len(images) > 2:
 
122
  base64_images.append(f"data:image/png;base64,{base64_encoded}")
123
 
124
  # Store all images as a single JSON object
125
+ # base64DataResp = json.dumps(base64_images)
126
 
127
  except Exception as e:
128
  logger.error(f"Error converting PDF to image: {e}")
 
131
  else:
132
  # Handle direct image files
133
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
134
+ base64DataResp.append(f"data:{content_type};base64,{base64_encoded}")
135
+ # base64_images.append(f"data:{content_type};base64,{base64_encoded}")
136
+ # base64DataResp = json.dumps(base64_images) # Store as a JSON object
137
 
138
  # Prepare OpenAI request
139
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
 
217
  "entityrefkey": entity_ref_key,
218
  "extracted_data": extracted_data
219
  }
 
220
  inserted_doc = invoice_collection.insert_one(document)
221
  document_id = str(inserted_doc.inserted_id)
222
  logger.info(f"Document inserted with ID: {document_id}")