document-extraction

Sleeping

vkumartr committed on Feb 11, 2025

Commit

acc3d5a

verified ·

1 Parent(s): 426d1ac

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -100,28 +100,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
     Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
     Ensures accurate JSON schema binding.
     """
-    system_prompt = """You are an expert in invoice data extraction.
-Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
-Extract the following fields:
-1. Line Items: A list containing:
-   - Product Code
-   - Description
-   - Amount (numeric)
-2. Tax Amount (if available)
-3. Vendor GST (if available)
-4. Vendor Name
-5. Invoice Date (format: "DD-MMM-YYYY")
-6. Total Amount (numeric)
-7. Invoice Number (alpha-numeric)
-8. Vendor Address
-9. Invoice Currency
-Ensure that:
-- All extracted fields match the invoice.
-- If any field is missing, return null instead of hallucinating data.
-- Do not generate synthetic values—only extract real information from the image.
-"""
     base64_images = []
     base64DataResp = []

     Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
     Ensures accurate JSON schema binding.
     """
+    system_prompt = "You are an expert in invoice data extraction."
     base64_images = []
     base64DataResp = []