document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 7, 2025

Commit

cf26152

verified ·

1 Parent(s): 9910dd1

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -3

app.py CHANGED Viewed

@@ -114,6 +114,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
                 img.save(img_byte_arr, format="PNG", dpi=(300, 300))
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
                 base64_images.append(f"data:{content_type};base64,{base64_encoded}")
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")
@@ -123,6 +124,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
         # Handle direct image files
         base64_encoded = base64.b64encode(file_data).decode('utf-8')
         base64_images.append(f"data:{content_type.lower()};base64,{base64_encoded}")
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
@@ -140,11 +142,11 @@ def extract_invoice_data(file_data, content_type, json_schema):
         )
         parsed_content = json.loads(response.choices[0].message.content.strip())
-        return parsed_content, base64_images
     except Exception as e:
         logger.error(f"Error in OpenAI processing: {e}")
-        return {"error": str(e)}, base64_images
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
@@ -214,7 +216,7 @@ def extract_text_from_file(
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64DataResp": base64_images,
             "extracted_data": extracted_data
         }

                 img.save(img_byte_arr, format="PNG", dpi=(300, 300))
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
                 base64_images.append(f"data:{content_type};base64,{base64_encoded}")
+                base64dataresp = f"data:{content_type};base64,{base64_encoded}"
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")
         # Handle direct image files
         base64_encoded = base64.b64encode(file_data).decode('utf-8')
         base64_images.append(f"data:{content_type.lower()};base64,{base64_encoded}")
+        base64dataresp = f"data:{content_type.lower()};base64,{base64_encoded}"
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
         )
         parsed_content = json.loads(response.choices[0].message.content.strip())
+        return parsed_content, base64dataresp
     except Exception as e:
         logger.error(f"Error in OpenAI processing: {e}")
+        return {"error": str(e)}, base64dataresp
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
+            "base64DataResp": base64dataresp,
             "extracted_data": extracted_data
         }