vkumartr committed on
Commit
dd8eaf2
·
verified ·
1 Parent(s): 57f0216

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -14
app.py CHANGED
@@ -84,7 +84,6 @@ def fetch_file_from_s3(file_key):
84
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
85
 
86
 
87
- # Function to summarize text using OpenAI GPT
88
  # Updated extraction function that handles PDF and image files differently
89
  def extract_invoice_data(file_data, content_type, json_schema):
90
  """
@@ -92,14 +91,24 @@ def extract_invoice_data(file_data, content_type, json_schema):
92
  For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
93
  """
94
  system_prompt = "You are an expert in document data extraction."
 
 
 
95
 
96
  if content_type == "application/pdf":
97
  # Use PyMuPDF to extract text directly from the PDF
98
  try:
99
  doc = fitz.open(stream=file_data, filetype="pdf")
 
 
 
 
 
 
100
  extracted_text = ""
101
  for page in doc:
102
  extracted_text += page.get_text()
 
103
  except Exception as e:
104
  logger.error(f"Error extracting text from PDF: {e}")
105
  raise
@@ -112,18 +121,35 @@ def extract_invoice_data(file_data, content_type, json_schema):
112
  )
113
 
114
  elif content_type.startswith("image/"):
115
- # For images, encode as Base64 and pass to OpenAI
116
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
117
- # In this example we assume the model accepts image inputs via a Base64 data URL.
118
- # (This requires access to a multimodal model.)
119
- prompt = (
120
- f"Extract the invoice data from the following image. "
121
- f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
122
- f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
123
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  else:
125
  raise ValueError(f"Unsupported content type: {content_type}")
126
 
 
127
  try:
128
  response = openai.ChatCompletion.create(
129
  model="gpt-4o-mini",
@@ -136,15 +162,15 @@ def extract_invoice_data(file_data, content_type, json_schema):
136
  )
137
 
138
  content = response.choices[0].message.content.strip()
139
-
140
- # Clean and parse JSON output (remove markdown formatting if present)
141
  cleaned_content = content.strip().strip('```json').strip('```')
 
142
  try:
143
  parsed_content = json.loads(cleaned_content)
144
- return parsed_content
 
145
  except json.JSONDecodeError as e:
146
  logger.error(f"JSON Parse Error: {e}")
147
- return None
148
 
149
  except Exception as e:
150
  logger.error(f"Error in data extraction: {e}")
 
84
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
85
 
86
 
 
87
  # Updated extraction function that handles PDF and image files differently
88
  def extract_invoice_data(file_data, content_type, json_schema):
89
  """
 
91
  For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
92
  """
93
  system_prompt = "You are an expert in document data extraction."
94
+ base64_encoded_images = [] # To store Base64-encoded image data
95
+
96
+ extracted_data = {}
97
 
98
  if content_type == "application/pdf":
99
  # Use PyMuPDF to extract text directly from the PDF
100
  try:
101
  doc = fitz.open(stream=file_data, filetype="pdf")
102
+ num_pages = doc.page_count
103
+
104
+ # Check if the number of pages exceeds 2
105
+ if num_pages > 2:
106
+ raise ValueError("The PDF contains more than 2 pages, extraction not supported.")
107
+
108
  extracted_text = ""
109
  for page in doc:
110
  extracted_text += page.get_text()
111
+
112
  except Exception as e:
113
  logger.error(f"Error extracting text from PDF: {e}")
114
  raise
 
121
  )
122
 
123
  elif content_type.startswith("image/"):
124
+ # For images, determine if more than 2 images are provided
125
+ try:
126
+ img = Image.open(io.BytesIO(file_data)) # Open the image file
127
+ num_images = img.n_frames # Get number of images (pages in the image file)
128
+
129
+ if num_images > 2:
130
+ raise ValueError("The image file contains more than 2 pages, extraction not supported.")
131
+
132
+ # Process each image page if there are 1 or 2 pages
133
+ for page_num in range(num_images):
134
+ img.seek(page_num) # Move to the current page
135
+ img_bytes = io.BytesIO()
136
+ img.save(img_bytes, format="PNG") # Save each page as a PNG image in memory
137
+ base64_encoded = base64.b64encode(img_bytes.getvalue()).decode('utf-8')
138
+ base64_encoded_images.append(base64_encoded)
139
+
140
+ # Build a prompt containing the image data for OpenAI
141
+ prompt = f"Extract the invoice data from the following images (Base64 encoded). Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
142
+ for base64_image in base64_encoded_images:
143
+ prompt += f"Image Data URL: data:{content_type};base64,{base64_image}\n"
144
+
145
+ except Exception as e:
146
+ logger.error(f"Error handling images: {e}")
147
+ raise
148
+
149
  else:
150
  raise ValueError(f"Unsupported content type: {content_type}")
151
 
152
+ # Send request to OpenAI for data extraction
153
  try:
154
  response = openai.ChatCompletion.create(
155
  model="gpt-4o-mini",
 
162
  )
163
 
164
  content = response.choices[0].message.content.strip()
 
 
165
  cleaned_content = content.strip().strip('```json').strip('```')
166
+
167
  try:
168
  parsed_content = json.loads(cleaned_content)
169
+ extracted_data["extracted_json"] = parsed_content # Store the parsed JSON data
170
+ return extracted_data
171
  except json.JSONDecodeError as e:
172
  logger.error(f"JSON Parse Error: {e}")
173
+ return {"error": f"JSON Parse Error: {str(e)}"}
174
 
175
  except Exception as e:
176
  logger.error(f"Error in data extraction: {e}")