Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,8 +97,8 @@ def extract_pdf_text(file_data):
|
|
| 97 |
# Function to extract invoice data using OpenAI GPT
|
| 98 |
def extract_invoice_data(file_data, content_type, json_schema):
|
| 99 |
"""
|
| 100 |
-
Extracts data from
|
| 101 |
-
|
| 102 |
"""
|
| 103 |
system_prompt = """You are an expert in invoice data extraction.
|
| 104 |
Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
|
|
@@ -111,21 +111,22 @@ Extract the following fields:
|
|
| 111 |
2. Tax Amount (if available)
|
| 112 |
3. Vendor GST (if available)
|
| 113 |
4. Vendor Name
|
| 114 |
-
5. Invoice Date (format: "DD-MMM-YYYY"
|
| 115 |
6. Total Amount (numeric)
|
| 116 |
-
7. Invoice Number (
|
| 117 |
8. Vendor Address
|
| 118 |
-
9. Invoice Currency
|
| 119 |
|
| 120 |
Ensure that:
|
| 121 |
- All extracted fields match the invoice.
|
| 122 |
- If any field is missing, return null instead of hallucinating data.
|
| 123 |
- Do not generate synthetic values—only extract real information from the image.
|
| 124 |
"""
|
|
|
|
| 125 |
base64_images = []
|
| 126 |
base64DataResp = []
|
| 127 |
extracted_text = ""
|
| 128 |
-
|
| 129 |
if content_type == "application/pdf":
|
| 130 |
try:
|
| 131 |
extracted_text = extract_pdf_text(file_data)
|
|
@@ -151,14 +152,15 @@ Ensure that:
|
|
| 151 |
|
| 152 |
elif content_type.startswith("image/"):
|
| 153 |
# Handle direct image files
|
| 154 |
-
|
| 155 |
-
base64DataResp.append(f"data:{content_type};base64,{
|
| 156 |
-
|
|
|
|
| 157 |
else:
|
| 158 |
return {"error": f"Unsupported file type: {content_type}"}
|
| 159 |
|
| 160 |
if extracted_text:
|
| 161 |
-
|
| 162 |
|
| 163 |
# Prepare OpenAI request
|
| 164 |
openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
|
|
|
|
| 97 |
# Function to extract invoice data using OpenAI GPT
|
| 98 |
def extract_invoice_data(file_data, content_type, json_schema):
|
| 99 |
"""
|
| 100 |
+
Extracts invoice data from PDFs (text-based) and images using OpenAI's GPT-4o-mini model.
|
| 101 |
+
Ensures accurate JSON schema binding.
|
| 102 |
"""
|
| 103 |
system_prompt = """You are an expert in invoice data extraction.
|
| 104 |
Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
|
|
|
|
| 111 |
2. Tax Amount (if available)
|
| 112 |
3. Vendor GST (if available)
|
| 113 |
4. Vendor Name
|
| 114 |
+
5. Invoice Date (format: "DD-MMM-YYYY")
|
| 115 |
6. Total Amount (numeric)
|
| 116 |
+
7. Invoice Number (alpha-numeric)
|
| 117 |
8. Vendor Address
|
| 118 |
+
9. Invoice Currency
|
| 119 |
|
| 120 |
Ensure that:
|
| 121 |
- All extracted fields match the invoice.
|
| 122 |
- If any field is missing, return null instead of hallucinating data.
|
| 123 |
- Do not generate synthetic values—only extract real information from the image.
|
| 124 |
"""
|
| 125 |
+
|
| 126 |
base64_images = []
|
| 127 |
base64DataResp = []
|
| 128 |
extracted_text = ""
|
| 129 |
+
|
| 130 |
if content_type == "application/pdf":
|
| 131 |
try:
|
| 132 |
extracted_text = extract_pdf_text(file_data)
|
|
|
|
| 152 |
|
| 153 |
elif content_type.startswith("image/"):
|
| 154 |
# Handle direct image files
|
| 155 |
+
base64_img = base64.b64encode(file_data).decode('utf-8')
|
| 156 |
+
base64DataResp.append(f"data:{content_type};base64,{base64_img}")
|
| 157 |
+
base64_images.append(f"data:{content_type};base64,{base64_img}")
|
| 158 |
+
|
| 159 |
else:
|
| 160 |
return {"error": f"Unsupported file type: {content_type}"}
|
| 161 |
|
| 162 |
if extracted_text:
|
| 163 |
+
return {"extracted_text": extracted_text}, base64DataResp
|
| 164 |
|
| 165 |
# Prepare OpenAI request
|
| 166 |
openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
|