document-extraction

Sleeping

vkumartr committed on Feb 11, 2025

Commit

6761af7

verified ·

1 Parent(s): acc3d5a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ from fastapi.staticfiles import StaticFiles
 import hashlib
 from enum import Enum
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
-from PIL import Image
 from pdf2image import convert_from_bytes
 import io
 import fitz  # PyMuPDF for PDF handling
@@ -100,7 +99,28 @@ def extract_invoice_data(file_data, content_type, json_schema):
     Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
     Ensures accurate JSON schema binding.
     """
-    system_prompt = "You are an expert in invoice data extraction."
     base64_images = []
     base64DataResp = []

 import hashlib
 from enum import Enum
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
 from pdf2image import convert_from_bytes
 import io
 import fitz  # PyMuPDF for PDF handling
     Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
     Ensures accurate JSON schema binding.
     """
+    system_prompt = """You are an expert in invoice data extraction.
+Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
+Extract the following fields:
+1. Line Items: A list containing:
+   - Product Code
+   - Description
+   - Amount (numeric)
+2. Tax Amount (if available)
+3. Vendor GST (if available)
+4. Vendor Name
+5. Invoice Date (format: "DD-MMM-YYYY")
+6. Total Amount (numeric)
+7. Invoice Number (alpha-numeric)
+8. Vendor Address
+9. Invoice Currency
+Ensure that:
+- All extracted fields match the invoice.
+- If any field is missing, return null instead of hallucinating data.
+- Do not generate synthetic values—only extract real information from the image.
+"""
     base64_images = []
     base64DataResp = []