Spaces:
Sleeping
Sleeping
Stores base64 before converting PDF to image
Browse files
app.py
CHANGED
|
@@ -106,6 +106,10 @@ def extract_invoice_data(file_data, content_type, json_schema):
|
|
| 106 |
if content_type == "application/pdf":
|
| 107 |
try:
|
| 108 |
extracted_text = extract_pdf_text(file_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
images = convert_from_bytes(file_data) # Convert PDF to images
|
| 110 |
|
| 111 |
if len(images) > 2:
|
|
@@ -118,7 +122,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
|
|
| 118 |
base64_images.append(f"data:image/png;base64,{base64_encoded}")
|
| 119 |
|
| 120 |
# Store all images as a single JSON object
|
| 121 |
-
base64DataResp = json.dumps(base64_images)
|
| 122 |
|
| 123 |
except Exception as e:
|
| 124 |
logger.error(f"Error converting PDF to image: {e}")
|
|
@@ -127,8 +131,9 @@ def extract_invoice_data(file_data, content_type, json_schema):
|
|
| 127 |
else:
|
| 128 |
# Handle direct image files
|
| 129 |
base64_encoded = base64.b64encode(file_data).decode('utf-8')
|
| 130 |
-
|
| 131 |
-
|
|
|
|
| 132 |
|
| 133 |
# Prepare OpenAI request
|
| 134 |
openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
|
|
@@ -212,7 +217,6 @@ def extract_text_from_file(
|
|
| 212 |
"entityrefkey": entity_ref_key,
|
| 213 |
"extracted_data": extracted_data
|
| 214 |
}
|
| 215 |
-
|
| 216 |
inserted_doc = invoice_collection.insert_one(document)
|
| 217 |
document_id = str(inserted_doc.inserted_id)
|
| 218 |
logger.info(f"Document inserted with ID: {document_id}")
|
|
|
|
| 106 |
if content_type == "application/pdf":
|
| 107 |
try:
|
| 108 |
extracted_text = extract_pdf_text(file_data)
|
| 109 |
+
# Store PDF as Base64
|
| 110 |
+
base64_pdf = base64.b64encode(file_data).decode('utf-8')
|
| 111 |
+
base64DataResp.append(f"data:application/pdf;base64,{base64_pdf}")
|
| 112 |
+
|
| 113 |
images = convert_from_bytes(file_data) # Convert PDF to images
|
| 114 |
|
| 115 |
if len(images) > 2:
|
|
|
|
| 122 |
base64_images.append(f"data:image/png;base64,{base64_encoded}")
|
| 123 |
|
| 124 |
# Store all images as a single JSON object
|
| 125 |
+
# base64DataResp = json.dumps(base64_images)
|
| 126 |
|
| 127 |
except Exception as e:
|
| 128 |
logger.error(f"Error converting PDF to image: {e}")
|
|
|
|
| 131 |
else:
|
| 132 |
# Handle direct image files
|
| 133 |
base64_encoded = base64.b64encode(file_data).decode('utf-8')
|
| 134 |
+
base64DataResp.append(f"data:{content_type};base64,{base64_encoded}")
|
| 135 |
+
# base64_images.append(f"data:{content_type};base64,{base64_encoded}")
|
| 136 |
+
# base64DataResp = json.dumps(base64_images) # Store as a JSON object
|
| 137 |
|
| 138 |
# Prepare OpenAI request
|
| 139 |
openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
|
|
|
|
| 217 |
"entityrefkey": entity_ref_key,
|
| 218 |
"extracted_data": extracted_data
|
| 219 |
}
|
|
|
|
| 220 |
inserted_doc = invoice_collection.insert_one(document)
|
| 221 |
document_id = str(inserted_doc.inserted_id)
|
| 222 |
logger.info(f"Document inserted with ID: {document_id}")
|