document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 10, 2025

Commit

5452b1d

verified ·

1 Parent(s): 27a46d9

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -10

app.py CHANGED Viewed

@@ -97,8 +97,8 @@ def extract_pdf_text(file_data):
 # Function to extract invoice data using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
-    Extracts data from a PDF (converted to images) or an image.
-    Only PDFs with 1 or 2 pages are allowed.
     """
     system_prompt = """You are an expert in invoice data extraction.
 Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
@@ -111,21 +111,22 @@ Extract the following fields:
 2. Tax Amount (if available)
 3. Vendor GST (if available)
 4. Vendor Name
-5. Invoice Date (format: "DD-MMM-YYYY", e.g., "15-Sep-2023")
 6. Total Amount (numeric)
-7. Invoice Number (e.g., "INV-2023-001")
 8. Vendor Address
-9. Invoice Currency (e.g., "USD", "EUR")
 Ensure that:
 - All extracted fields match the invoice.
 - If any field is missing, return null instead of hallucinating data.
 - Do not generate synthetic values—only extract real information from the image.
 """
     base64_images = []
     base64DataResp = []
     extracted_text = ""
     if content_type == "application/pdf":
         try:
             extracted_text = extract_pdf_text(file_data)
@@ -151,14 +152,15 @@ Ensure that:
     elif content_type.startswith("image/"):
         # Handle direct image files
-        base64_pdf = base64.b64encode(file_data).decode('utf-8')
-        base64DataResp.append(f"data:{content_type};base64,{base64_pdf}")
     else:
         return {"error": f"Unsupported file type: {content_type}"}
     if extracted_text:
-            return {"extracted_text": extracted_text}, base64DataResp
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]

 # Function to extract invoice data using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
+    Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
+    Ensures accurate JSON schema binding.
     """
     system_prompt = """You are an expert in invoice data extraction.
 Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
 2. Tax Amount (if available)
 3. Vendor GST (if available)
 4. Vendor Name
+5. Invoice Date (format: "DD-MMM-YYYY")
 6. Total Amount (numeric)
+7. Invoice Number (alpha-numeric)
 8. Vendor Address
+9. Invoice Currency
 Ensure that:
 - All extracted fields match the invoice.
 - If any field is missing, return null instead of hallucinating data.
 - Do not generate synthetic values—only extract real information from the image.
 """
     base64_images = []
     base64DataResp = []
     extracted_text = ""
     if content_type == "application/pdf":
         try:
             extracted_text = extract_pdf_text(file_data)
     elif content_type.startswith("image/"):
         # Handle direct image files
+        base64_img = base64.b64encode(file_data).decode('utf-8')
+        base64DataResp.append(f"data:{content_type};base64,{base64_img}")
+        base64_images.append(f"data:{content_type};base64,{base64_img}")
     else:
         return {"error": f"Unsupported file type: {content_type}"}
     if extracted_text:
+        return {"extracted_text": extracted_text}, base64DataResp
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]