vkumartr committed on
Commit
2e4e1c7
·
verified ·
1 Parent(s): 851fdc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -14
app.py CHANGED
@@ -80,7 +80,6 @@ def fetch_file_from_s3(file_key):
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
- # Function to summarize text using OpenAI GPT
84
  # Updated extraction function that handles PDF and image files differently
85
  def extract_invoice_data(file_data, content_type, json_schema):
86
  """
@@ -88,14 +87,27 @@ def extract_invoice_data(file_data, content_type, json_schema):
88
  For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
89
  """
90
  system_prompt = "You are an expert in document data extraction."
 
 
 
91
 
92
  if content_type == "application/pdf":
93
  # Use PyMuPDF to extract text directly from the PDF
94
  try:
95
  doc = fitz.open(stream=file_data, filetype="pdf")
 
 
 
 
 
 
96
  extracted_text = ""
97
  for page in doc:
98
  extracted_text += page.get_text()
 
 
 
 
99
  except Exception as e:
100
  logger.error(f"Error extracting text from PDF: {e}")
101
  raise
@@ -108,18 +120,38 @@ def extract_invoice_data(file_data, content_type, json_schema):
108
  )
109
 
110
  elif content_type.startswith("image/"):
111
- # For images, encode as Base64 and pass to OpenAI
112
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
113
- # In this example we assume the model accepts image inputs via a Base64 data URL.
114
- # (This requires access to a multimodal model.)
115
- prompt = (
116
- f"Extract the invoice data from the following image. "
117
- f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
118
- f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  else:
121
  raise ValueError(f"Unsupported content type: {content_type}")
122
 
 
123
  try:
124
  response = openai.ChatCompletion.create(
125
  model="gpt-4o-mini",
@@ -132,15 +164,15 @@ def extract_invoice_data(file_data, content_type, json_schema):
132
  )
133
 
134
  content = response.choices[0].message.content.strip()
135
-
136
- # Clean and parse JSON output (remove markdown formatting if present)
137
  cleaned_content = content.strip().strip('```json').strip('```')
 
138
  try:
139
  parsed_content = json.loads(cleaned_content)
140
- return parsed_content
 
141
  except json.JSONDecodeError as e:
142
  logger.error(f"JSON Parse Error: {e}")
143
- return None
144
 
145
  except Exception as e:
146
  logger.error(f"Error in data extraction: {e}")
 
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
 
83
  # Updated extraction function that handles PDF and image files differently
84
  def extract_invoice_data(file_data, content_type, json_schema):
85
  """
 
87
  For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
88
  """
89
  system_prompt = "You are an expert in document data extraction."
90
+ base64_encoded_images = [] # To store Base64-encoded image data
91
+
92
+ extracted_data = {}
93
 
94
  if content_type == "application/pdf":
95
  # Use PyMuPDF to extract text directly from the PDF
96
  try:
97
  doc = fitz.open(stream=file_data, filetype="pdf")
98
+ num_pages = doc.page_count
99
+
100
+ # Check if the number of pages exceeds 2
101
+ if num_pages > 2:
102
+ raise ValueError("The PDF contains more than 2 pages, extraction not supported.")
103
+
104
  extracted_text = ""
105
  for page in doc:
106
  extracted_text += page.get_text()
107
+
108
+ # Store the extracted text in the dictionary
109
+ extracted_data["text"] = extracted_text
110
+
111
  except Exception as e:
112
  logger.error(f"Error extracting text from PDF: {e}")
113
  raise
 
120
  )
121
 
122
  elif content_type.startswith("image/"):
123
+ # For images, determine if more than 2 images are provided
124
+ try:
125
+ img = Image.open(io.BytesIO(file_data)) # Open the image file
126
+ num_images = img.n_frames # Get number of images (pages in the image file)
127
+
128
+ if num_images > 2:
129
+ raise ValueError("The image file contains more than 2 pages, extraction not supported.")
130
+
131
+ # Process each image page if there are 1 or 2 pages
132
+ for page_num in range(num_images):
133
+ img.seek(page_num) # Move to the current page
134
+ img_bytes = io.BytesIO()
135
+ img.save(img_bytes, format="PNG") # Save each page as a PNG image in memory
136
+ base64_encoded = base64.b64encode(img_bytes.getvalue()).decode('utf-8')
137
+ base64_encoded_images.append(base64_encoded)
138
+
139
+ # Add Base64 image data to the extracted data dictionary
140
+ extracted_data["base64_images"] = base64_encoded_images
141
+
142
+ # Build a prompt containing the image data for OpenAI
143
+ prompt = f"Extract the invoice data from the following images (Base64 encoded). Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
144
+ for base64_image in base64_encoded_images:
145
+ prompt += f"Image Data URL: data:{content_type};base64,{base64_image}\n"
146
+
147
+ except Exception as e:
148
+ logger.error(f"Error handling images: {e}")
149
+ raise
150
+
151
  else:
152
  raise ValueError(f"Unsupported content type: {content_type}")
153
 
154
+ # Send request to OpenAI for data extraction
155
  try:
156
  response = openai.ChatCompletion.create(
157
  model="gpt-4o-mini",
 
164
  )
165
 
166
  content = response.choices[0].message.content.strip()
 
 
167
  cleaned_content = content.strip().strip('```json').strip('```')
168
+
169
  try:
170
  parsed_content = json.loads(cleaned_content)
171
+ extracted_data["extracted_json"] = parsed_content # Store the parsed JSON data
172
+ return extracted_data
173
  except json.JSONDecodeError as e:
174
  logger.error(f"JSON Parse Error: {e}")
175
+ return {"error": f"JSON Parse Error: {str(e)}"}
176
 
177
  except Exception as e:
178
  logger.error(f"Error in data extraction: {e}")