vkumartr committed on
Commit
7a87996
·
verified ·
1 Parent(s): 59409c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -9
app.py CHANGED
@@ -101,7 +101,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
101
  """
102
  system_prompt = "You are an expert in document data extraction."
103
  base64_images = []
104
- base64DataResp = []
105
 
106
  if content_type == "application/pdf":
107
  try:
@@ -115,10 +115,10 @@ def extract_invoice_data(file_data, content_type, json_schema):
115
  img_byte_arr = io.BytesIO()
116
  img.save(img_byte_arr, format="PNG", dpi=(300, 300))
117
  base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
118
- base64_images.append(f"data:image/png;base64,{base64_encoded}")
119
 
120
  # Store all images as a single JSON object
121
- base64DataResp = json.dumps(base64_images)
122
 
123
  except Exception as e:
124
  logger.error(f"Error converting PDF to image: {e}")
@@ -128,7 +128,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
128
  # Handle direct image files
129
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
130
  base64_images.append(f"data:{content_type};base64,{base64_encoded}")
131
- base64DataResp = json.dumps(base64_images) # Store as a JSON object
132
 
133
  # Prepare OpenAI request
134
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
@@ -146,11 +146,11 @@ def extract_invoice_data(file_data, content_type, json_schema):
146
  )
147
 
148
  parsed_content = json.loads(response.choices[0].message.content.strip())
149
- return parsed_content, base64DataResp
150
 
151
  except Exception as e:
152
  logger.error(f"Error in OpenAI processing: {e}")
153
- return {"error": str(e)}, base64DataResp
154
 
155
 
156
  def get_content_type_from_s3(file_key):
@@ -201,14 +201,14 @@ def extract_text_from_file(
201
  file_data, _ = fetch_file_from_s3(file_key)
202
 
203
  # Extract structured data from the document
204
- extracted_data, base64DataResp = extract_invoice_data(file_data, content_type, json_schema)
205
 
206
  # Store document in MongoDB
207
  document = {
208
  "file_key": file_key,
209
  "file_type": content_type,
210
  "document_type": document_type,
211
- "base64DataResp": base64DataResp,
212
  "entityrefkey": entity_ref_key,
213
  "extracted_data": extracted_data
214
  }
@@ -221,7 +221,7 @@ def extract_text_from_file(
221
  "message": "Document successfully stored in MongoDB",
222
  "document_id": document_id,
223
  "entityrefkey": entity_ref_key,
224
- "base64DataResp": base64DataResp,
225
  "extracted_data": extracted_data
226
  }
227
 
 
101
  """
102
  system_prompt = "You are an expert in document data extraction."
103
  base64_images = []
104
+ #base64DataResp = []
105
 
106
  if content_type == "application/pdf":
107
  try:
 
115
  img_byte_arr = io.BytesIO()
116
  img.save(img_byte_arr, format="PNG", dpi=(300, 300))
117
  base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
118
+ base64_images.append(f"data:{content_type};base64,{base64_encoded}")
119
 
120
  # Store all images as a single JSON object
121
+ #base64DataResp = json.dumps(base64_images)
122
 
123
  except Exception as e:
124
  logger.error(f"Error converting PDF to image: {e}")
 
128
  # Handle direct image files
129
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
130
  base64_images.append(f"data:{content_type};base64,{base64_encoded}")
131
+ #base64DataResp = json.dumps(base64_images) # Store as a JSON object
132
 
133
  # Prepare OpenAI request
134
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
 
146
  )
147
 
148
  parsed_content = json.loads(response.choices[0].message.content.strip())
149
+ return parsed_content, base64_images
150
 
151
  except Exception as e:
152
  logger.error(f"Error in OpenAI processing: {e}")
153
+ return {"error": str(e)}, base64_images
154
 
155
 
156
  def get_content_type_from_s3(file_key):
 
201
  file_data, _ = fetch_file_from_s3(file_key)
202
 
203
  # Extract structured data from the document
204
+ extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)
205
 
206
  # Store document in MongoDB
207
  document = {
208
  "file_key": file_key,
209
  "file_type": content_type,
210
  "document_type": document_type,
211
+ "base64DataResp": base64_images,
212
  "entityrefkey": entity_ref_key,
213
  "extracted_data": extracted_data
214
  }
 
221
  "message": "Document successfully stored in MongoDB",
222
  "document_id": document_id,
223
  "entityrefkey": entity_ref_key,
224
+ "base64DataResp": base64_images,
225
  "extracted_data": extracted_data
226
  }
227