document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 10, 2025

Commit

e9cefc0

verified ·

1 Parent(s): 24259cd

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -1

app.py CHANGED Viewed

@@ -93,15 +93,49 @@ def extract_pdf_text(file_data):
         logger.error(f"PDF Extraction Error: {e}")
         return None
 # Function to extract invoice data from a PDF or image using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
-    system_prompt = "You are an expert in document data extraction."
     base64_images = []
     base64DataResp = []
     if content_type == "application/pdf":
         try:
@@ -127,12 +161,17 @@ def extract_invoice_data(file_data, content_type, json_schema):
             return {"error": "Failed to process PDF"}, None
     elif content_type.startswith("image/"):
         # Handle direct image files
         base64_pdf = base64.b64encode(file_data).decode('utf-8')
         base64DataResp.append(f"data:{content_type};base64,{base64_pdf}")
     else:
         return {"error": f"Unsupported file type: {content_type}"}
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]

         logger.error(f"PDF Extraction Error: {e}")
         return None
+# def extract_text_from_image(file_data):
+#     """
+#     Extracts text from an image file using pytesseract
+#     """
+#     try:
+#         image = Image.open(io.BytesIO(image_data))
+#         extracted_text = pytesseract.image_to_string(image)
+#         return extracted_text.strip()
+#     except Exception as e:
+#         logger.error(f"Image Extraction Error: {e}")
+#         return None
 # Function to extract invoice data from a PDF or image using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
+    system_prompt = """You are an expert in invoice data extraction.
+Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
+Extract the following fields:
+1. Line Items: A list containing:
+   - Product Code
+   - Description
+   - Amount (numeric)
+2. Tax Amount (if available)
+3. Vendor GST (if available)
+4. Vendor Name
+5. Invoice Date (format: "DD-MMM-YYYY", e.g., "15-Sep-2023")
+6. Total Amount (numeric)
+7. Invoice Number (e.g., "INV-2023-001")
+8. Vendor Address
+9. Invoice Currency (e.g., "USD", "EUR")
+Ensure that:
+- All extracted fields match the invoice.
+- If any field is missing, return null instead of hallucinating data.
+- Do not generate synthetic values—only extract real information from the image.
+"""
     base64_images = []
     base64DataResp = []
+    extracted_text = ""
     if content_type == "application/pdf":
         try:
             return {"error": "Failed to process PDF"}, None
     elif content_type.startswith("image/"):
+        # extracted_text = extract_text_from_image(file_data)  # OCR extraction
         # Handle direct image files
         base64_pdf = base64.b64encode(file_data).decode('utf-8')
         base64DataResp.append(f"data:{content_type};base64,{base64_pdf}")
     else:
         return {"error": f"Unsupported file type: {content_type}"}
+    if extracted_text:
+            return {"extracted_text": extracted_text}, base64DataResp
     # Prepare OpenAI request
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]