vkumartr committed on
Commit
851fdc1
·
verified ·
1 Parent(s): b6edac7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -24
app.py CHANGED
@@ -81,17 +81,42 @@ def fetch_file_from_s3(file_key):
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
 
84
  def extract_invoice_data(file_data, content_type, json_schema):
 
 
 
 
85
  system_prompt = "You are an expert in document data extraction."
86
 
87
- # Convert file to Base64
88
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # Determine the correct MIME type for OpenAI
91
- if content_type.startswith("image/"):
92
- mime_type = content_type # e.g., image/png, image/jpeg
93
- elif content_type == "application/pdf":
94
- mime_type = "application/pdf"
 
 
 
 
 
95
  else:
96
  raise ValueError(f"Unsupported content type: {content_type}")
97
 
@@ -100,30 +125,16 @@ def extract_invoice_data(file_data, content_type, json_schema):
100
  model="gpt-4o-mini",
101
  messages=[
102
  {"role": "system", "content": system_prompt},
103
- {
104
- "role": "user",
105
- "content": [
106
- {
107
- "type": "image_url",
108
- "image_url": {
109
- "url": f"data:{mime_type};base64,{base64_encoded}"
110
- }
111
- }
112
- ]
113
- }
114
  ],
115
- response_format={
116
- "type": "json_schema",
117
- "json_schema": json_schema
118
- },
119
  temperature=0.5,
120
  max_tokens=16384
121
  )
122
 
123
- # Clean and parse JSON output
124
  content = response.choices[0].message.content.strip()
125
- cleaned_content = content.strip().strip('```json').strip('```')
126
 
 
 
127
  try:
128
  parsed_content = json.loads(cleaned_content)
129
  return parsed_content
 
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
84
+ # Updated extraction function that handles PDF and image files differently
85
  def extract_invoice_data(file_data, content_type, json_schema):
86
+ """
87
+ For PDFs: Extract the embedded text using PyMuPDF (no OCR involved)
88
+ For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
89
+ """
90
  system_prompt = "You are an expert in document data extraction."
91
 
92
+ if content_type == "application/pdf":
93
+ # Use PyMuPDF to extract text directly from the PDF
94
+ try:
95
+ doc = fitz.open(stream=file_data, filetype="pdf")
96
+ extracted_text = ""
97
+ for page in doc:
98
+ extracted_text += page.get_text()
99
+ except Exception as e:
100
+ logger.error(f"Error extracting text from PDF: {e}")
101
+ raise
102
+
103
+ # Build a prompt containing the extracted text and the schema
104
+ prompt = (
105
+ f"Extract the invoice data from the following PDF text. "
106
+ f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
107
+ f"PDF Text:\n{extracted_text}"
108
+ )
109
 
110
+ elif content_type.startswith("image/"):
111
+ # For images, encode as Base64 and pass to OpenAI
112
+ base64_encoded = base64.b64encode(file_data).decode('utf-8')
113
+ # In this example we assume the model accepts image inputs via a Base64 data URL.
114
+ # (This requires access to a multimodal model.)
115
+ prompt = (
116
+ f"Extract the invoice data from the following image. "
117
+ f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
118
+ f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
119
+ )
120
  else:
121
  raise ValueError(f"Unsupported content type: {content_type}")
122
 
 
125
  model="gpt-4o-mini",
126
  messages=[
127
  {"role": "system", "content": system_prompt},
128
+ {"role": "user", "content": prompt},
 
 
 
 
 
 
 
 
 
 
129
  ],
 
 
 
 
130
  temperature=0.5,
131
  max_tokens=16384
132
  )
133
 
 
134
  content = response.choices[0].message.content.strip()
 
135
 
136
+ # Clean and parse JSON output (remove markdown formatting if present)
137
+ cleaned_content = content.strip().strip('```json').strip('```')
138
  try:
139
  parsed_content = json.loads(cleaned_content)
140
  return parsed_content