vkumartr committed on
Commit
89d454b
·
verified ·
1 Parent(s): f66ab35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -18
app.py CHANGED
@@ -230,6 +230,15 @@ def extract_invoice_data(file_data, content_type):
230
  logger.error(f"Error in data extraction: {e}")
231
  return {"error": str(e)}
232
 
 
 
 
 
 
 
 
 
 
233
  # Dependency to check API Key
234
  def verify_api_key(api_key: str = Header(...)):
235
  if api_key != API_KEY:
@@ -257,18 +266,12 @@ def extract_text_from_file(
257
  "document": existing_document
258
  }
259
 
260
- # Retrieve file from S3 and determine content type (Ensure this step is implemented)
261
- content_type = get_content_type_from_s3(file_key) # Implement this function
262
-
263
- # Extract text (Ensure Extraction function is implemented)
264
- extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
265
 
266
- # Define values for small/large files
267
- base64DataResp = None
268
- summary = None
269
- if num_pages <= 2:
270
- base64DataResp = convert_to_base64(file_key) # Implement this function
271
- summary = generate_summary(extracted_text) # Implement this function
272
 
273
  # Store extracted data in MongoDB
274
  document = {
@@ -276,10 +279,7 @@ def extract_text_from_file(
276
  "file_type": content_type,
277
  "document_type": document_type,
278
  "entityrefkey": entity_ref_key,
279
- "num_pages": num_pages,
280
- "base64DataResp": base64DataResp, # Only for small files
281
- "extracted_text": extracted_text,
282
- "summary": summary, # Only for small files
283
  }
284
 
285
  inserted_doc = invoice_collection.insert_one(document)
@@ -288,9 +288,8 @@ def extract_text_from_file(
288
  return {
289
  "message": "Document successfully stored in MongoDB",
290
  "document_id": document_id,
291
- "file_key": file_key,
292
- "num_pages": num_pages,
293
- "summary": summary if summary else "Skipped for large documents"
294
  }
295
 
296
  except Exception as e:
 
230
  logger.error(f"Error in data extraction: {e}")
231
  return {"error": str(e)}
232
 
233
+ #def get_content_type_from_s3(file_key):
234
+ """Fetch the content type (MIME type) of a file stored in S3."""
235
+ try:
236
+ response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
237
+ return response.get('ContentType', 'application/octet-stream') # Default to binary if not found
238
+ except Exception as e:
239
+ raise Exception(f"Failed to get content type from S3: {str(e)}")
240
+
241
+
242
  # Dependency to check API Key
243
  def verify_api_key(api_key: str = Header(...)):
244
  if api_key != API_KEY:
 
266
  "document": existing_document
267
  }
268
 
269
+ # Retrieve file from S3 and determine content type
270
+ content_type = get_content_type_from_s3(file_key)
 
 
 
271
 
272
+ # Extract and parse invoice data
273
+ file_data, _ = fetch_file_from_s3(file_key)
274
+ extracted_data = extract_invoice_data(file_data, content_type)
 
 
 
275
 
276
  # Store extracted data in MongoDB
277
  document = {
 
279
  "file_type": content_type,
280
  "document_type": document_type,
281
  "entityrefkey": entity_ref_key,
282
+ "extracted_data": extracted_data
 
 
 
283
  }
284
 
285
  inserted_doc = invoice_collection.insert_one(document)
 
288
  return {
289
  "message": "Document successfully stored in MongoDB",
290
  "document_id": document_id,
291
+ "entityrefkey":entity_ref_key,
292
+ "extracted_data": extracted_data
 
293
  }
294
 
295
  except Exception as e: