document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 7, 2025

Commit

e0f7bfa

verified ·

1 Parent(s): 78199be

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -7

app.py CHANGED Viewed

@@ -99,21 +99,23 @@ def extract_invoice_data(file_data, content_type, json_schema):
     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
-    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
     base64_images = []
     if content_type == "application/pdf":
         try:
             images = convert_from_bytes(file_data)  # Convert PDF to images
             if len(images) > 2:
-                raise ValueError("PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported.")
             for img in images[:2]:  # Convert up to 2 pages
                 img_byte_arr = io.BytesIO()
-                img.save(img_byte_arr, format="PNG")
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
-                base64_images.append(f"data:{content_type};base64,{base64_encoded}")
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")
@@ -123,6 +125,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
         # Handle direct image files
         base64_encoded = base64.b64encode(file_data).decode('utf-8')
         base64_images.append(f"data:{content_type};base64,{base64_encoded}")
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
@@ -140,11 +143,11 @@ def extract_invoice_data(file_data, content_type, json_schema):
         )
         parsed_content = json.loads(response.choices[0].message.content.strip())
-        return parsed_content, base64_images
     except Exception as e:
         logger.error(f"Error in OpenAI processing: {e}")
-        return {"error": str(e)}, base64_images
 def get_content_type_from_s3(file_key):
@@ -215,7 +218,7 @@ def extract_text_from_file(
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64DataResp": base64_images,
             "extracted_data": extracted_data
         }

     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
+    system_prompt = "You are an expert in document data extraction."
     base64_images = []
     if content_type == "application/pdf":
         try:
+            extracted_text = extract_pdf_text(file_data)
             images = convert_from_bytes(file_data)  # Convert PDF to images
             if len(images) > 2:
+                raise ValueError("PDF contains more than 2 pages.")
             for img in images[:2]:  # Convert up to 2 pages
                 img_byte_arr = io.BytesIO()
+                img.save(img_byte_arr, format="PNG", dpi=(300, 300))
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
+                base64_images.append(f"data:image/png;base64,{base64_encoded}")
+                base64DataResp = f"data:image/png;base64,{base64_encoded}"
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")
         # Handle direct image files
         base64_encoded = base64.b64encode(file_data).decode('utf-8')
         base64_images.append(f"data:{content_type};base64,{base64_encoded}")
+        base64DataResp = f"data:image/png;base64,{base64_encoded}"
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
         )
         parsed_content = json.loads(response.choices[0].message.content.strip())
+        return parsed_content, base64DataResp
     except Exception as e:
         logger.error(f"Error in OpenAI processing: {e}")
+        return {"error": str(e)}, base64DataResp
 def get_content_type_from_s3(file_key):
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
+            "base64DataResp": base64DataResp,
             "extracted_data": extracted_data
         }