vkumartr committed on
Commit
3d971ee
·
verified ·
1 Parent(s): 5c635bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -24
app.py CHANGED
@@ -230,14 +230,14 @@ def extract_invoice_data(file_data, content_type):
230
  logger.error(f"Error in data extraction: {e}")
231
  return {"error": str(e)}
232
 
233
- def extract_text_from_s3(file_key, content_type):
234
- return "Extracted text from file", 1 # Placeholder for real extraction logic
235
 
236
- def convert_to_base64(file_key):
237
- return "Base64 encoded data" # Placeholder
238
 
239
- def generate_summary(extracted_text):
240
- return "Summarized text" # Placeholder
241
 
242
  def get_content_type_from_s3(file_key):
243
  """Fetch the content type (MIME type) of a file stored in S3."""
@@ -275,18 +275,12 @@ def extract_text_from_file(
275
  "document": existing_document
276
  }
277
 
278
- # Retrieve file from S3 and determine content type (Ensure this step is implemented)
279
- content_type = get_content_type_from_s3(file_key) # Implement this function
280
 
281
- # Extract text (Ensure Extraction function is implemented)
282
- extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
283
-
284
- # Define values for small/large files
285
- base64DataResp = None
286
- summary = None
287
- if num_pages <= 2:
288
- base64DataResp = convert_to_base64(file_key) # Implement this function
289
- summary = generate_summary(extracted_text) # Implement this function
290
 
291
  # Store extracted data in MongoDB
292
  document = {
@@ -294,10 +288,7 @@ def extract_text_from_file(
294
  "file_type": content_type,
295
  "document_type": document_type,
296
  "entityrefkey": entity_ref_key,
297
- "num_pages": num_pages,
298
- "base64DataResp": base64DataResp, # Only for small files
299
- "extracted_text": extracted_text,
300
- "summary": summary, # Only for small files
301
  }
302
 
303
  inserted_doc = invoice_collection.insert_one(document)
@@ -306,9 +297,7 @@ def extract_text_from_file(
306
  return {
307
  "message": "Document successfully stored in MongoDB",
308
  "document_id": document_id,
309
- "file_key": file_key,
310
- "num_pages": num_pages,
311
- "summary": summary if summary else "Skipped for large documents"
312
  }
313
 
314
  except Exception as e:
@@ -318,6 +307,7 @@ def extract_text_from_file(
318
  "traceback": traceback.format_exc()
319
  }
320
  return {"error": error_details}
 
321
 
322
  # Serve the output folder as static files
323
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 
230
  logger.error(f"Error in data extraction: {e}")
231
  return {"error": str(e)}
232
 
233
+ # def extract_text_from_s3(file_key, content_type):
234
+ # return "Extracted text from file", 1 # Placeholder for real extraction logic
235
 
236
+ # def convert_to_base64(file_key):
237
+ # return "Base64 encoded data" # Placeholder
238
 
239
+ # def generate_summary(extracted_text):
240
+ # return "Summarized text" # Placeholder
241
 
242
  def get_content_type_from_s3(file_key):
243
  """Fetch the content type (MIME type) of a file stored in S3."""
 
275
  "document": existing_document
276
  }
277
 
278
+ # Retrieve file from S3 and determine content type
279
+ content_type = get_content_type_from_s3(file_key)
280
 
281
+ # Extract and parse invoice data
282
+ file_data, _ = fetch_file_from_s3(file_key)
283
+ extracted_data = extract_invoice_data(file_data, content_type)
 
 
 
 
 
 
284
 
285
  # Store extracted data in MongoDB
286
  document = {
 
288
  "file_type": content_type,
289
  "document_type": document_type,
290
  "entityrefkey": entity_ref_key,
291
+ "extracted_data": extracted_data
 
 
 
292
  }
293
 
294
  inserted_doc = invoice_collection.insert_one(document)
 
297
  return {
298
  "message": "Document successfully stored in MongoDB",
299
  "document_id": document_id,
300
+ "extracted_data": extracted_data
 
 
301
  }
302
 
303
  except Exception as e:
 
307
  "traceback": traceback.format_exc()
308
  }
309
  return {"error": error_details}
310
+
311
 
312
  # Serve the output folder as static files
313
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")