Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -81,17 +81,42 @@ def fetch_file_from_s3(file_key):
|
|
| 81 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 82 |
|
| 83 |
# Function to summarize text using OpenAI GPT
|
|
|
|
| 84 |
def extract_invoice_data(file_data, content_type, json_schema):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
system_prompt = "You are an expert in document data extraction."
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
else:
|
| 96 |
raise ValueError(f"Unsupported content type: {content_type}")
|
| 97 |
|
|
@@ -100,30 +125,16 @@ def extract_invoice_data(file_data, content_type, json_schema):
|
|
| 100 |
model="gpt-4o-mini",
|
| 101 |
messages=[
|
| 102 |
{"role": "system", "content": system_prompt},
|
| 103 |
-
{
|
| 104 |
-
"role": "user",
|
| 105 |
-
"content": [
|
| 106 |
-
{
|
| 107 |
-
"type": "image_url",
|
| 108 |
-
"image_url": {
|
| 109 |
-
"url": f"data:{mime_type};base64,{base64_encoded}"
|
| 110 |
-
}
|
| 111 |
-
}
|
| 112 |
-
]
|
| 113 |
-
}
|
| 114 |
],
|
| 115 |
-
response_format={
|
| 116 |
-
"type": "json_schema",
|
| 117 |
-
"json_schema": json_schema
|
| 118 |
-
},
|
| 119 |
temperature=0.5,
|
| 120 |
max_tokens=16384
|
| 121 |
)
|
| 122 |
|
| 123 |
-
# Clean and parse JSON output
|
| 124 |
content = response.choices[0].message.content.strip()
|
| 125 |
-
cleaned_content = content.strip().strip('```json').strip('```')
|
| 126 |
|
|
|
|
|
|
|
| 127 |
try:
|
| 128 |
parsed_content = json.loads(cleaned_content)
|
| 129 |
return parsed_content
|
|
|
|
| 81 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 82 |
|
| 83 |
# Function to summarize text using OpenAI GPT
|
| 84 |
+
# Updated extraction function that handles PDF and image files differently
|
| 85 |
def extract_invoice_data(file_data, content_type, json_schema):
|
| 86 |
+
"""
|
| 87 |
+
For PDFs: Extract the embedded text using PyMuPDF (no OCR involved)
|
| 88 |
+
For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
|
| 89 |
+
"""
|
| 90 |
system_prompt = "You are an expert in document data extraction."
|
| 91 |
|
| 92 |
+
if content_type == "application/pdf":
|
| 93 |
+
# Use PyMuPDF to extract text directly from the PDF
|
| 94 |
+
try:
|
| 95 |
+
doc = fitz.open(stream=file_data, filetype="pdf")
|
| 96 |
+
extracted_text = ""
|
| 97 |
+
for page in doc:
|
| 98 |
+
extracted_text += page.get_text()
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error(f"Error extracting text from PDF: {e}")
|
| 101 |
+
raise
|
| 102 |
+
|
| 103 |
+
# Build a prompt containing the extracted text and the schema
|
| 104 |
+
prompt = (
|
| 105 |
+
f"Extract the invoice data from the following PDF text. "
|
| 106 |
+
f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
|
| 107 |
+
f"PDF Text:\n{extracted_text}"
|
| 108 |
+
)
|
| 109 |
|
| 110 |
+
elif content_type.startswith("image/"):
|
| 111 |
+
# For images, encode as Base64 and pass to OpenAI
|
| 112 |
+
base64_encoded = base64.b64encode(file_data).decode('utf-8')
|
| 113 |
+
# In this example we assume the model accepts image inputs via a Base64 data URL.
|
| 114 |
+
# (This requires access to a multimodal model.)
|
| 115 |
+
prompt = (
|
| 116 |
+
f"Extract the invoice data from the following image. "
|
| 117 |
+
f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
|
| 118 |
+
f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
|
| 119 |
+
)
|
| 120 |
else:
|
| 121 |
raise ValueError(f"Unsupported content type: {content_type}")
|
| 122 |
|
|
|
|
| 125 |
model="gpt-4o-mini",
|
| 126 |
messages=[
|
| 127 |
{"role": "system", "content": system_prompt},
|
| 128 |
+
{"role": "user", "content": prompt},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
temperature=0.5,
|
| 131 |
max_tokens=16384
|
| 132 |
)
|
| 133 |
|
|
|
|
| 134 |
content = response.choices[0].message.content.strip()
|
|
|
|
| 135 |
|
| 136 |
+
# Clean and parse JSON output (remove markdown formatting if present)
|
| 137 |
+
cleaned_content = content.strip().strip('```json').strip('```')
|
| 138 |
try:
|
| 139 |
parsed_content = json.loads(cleaned_content)
|
| 140 |
return parsed_content
|