Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -230,14 +230,14 @@ def extract_invoice_data(file_data, content_type):
|
|
| 230 |
logger.error(f"Error in data extraction: {e}")
|
| 231 |
return {"error": str(e)}
|
| 232 |
|
| 233 |
-
def extract_text_from_s3(file_key, content_type):
|
| 234 |
-
|
| 235 |
|
| 236 |
-
def convert_to_base64(file_key):
|
| 237 |
-
|
| 238 |
|
| 239 |
-
def generate_summary(extracted_text):
|
| 240 |
-
|
| 241 |
|
| 242 |
def get_content_type_from_s3(file_key):
|
| 243 |
"""Fetch the content type (MIME type) of a file stored in S3."""
|
|
@@ -275,18 +275,12 @@ def extract_text_from_file(
|
|
| 275 |
"document": existing_document
|
| 276 |
}
|
| 277 |
|
| 278 |
-
# Retrieve file from S3 and determine content type
|
| 279 |
-
content_type = get_content_type_from_s3(file_key)
|
| 280 |
|
| 281 |
-
# Extract
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
# Define values for small/large files
|
| 285 |
-
base64DataResp = None
|
| 286 |
-
summary = None
|
| 287 |
-
if num_pages <= 2:
|
| 288 |
-
base64DataResp = convert_to_base64(file_key) # Implement this function
|
| 289 |
-
summary = generate_summary(extracted_text) # Implement this function
|
| 290 |
|
| 291 |
# Store extracted data in MongoDB
|
| 292 |
document = {
|
|
@@ -294,10 +288,7 @@ def extract_text_from_file(
|
|
| 294 |
"file_type": content_type,
|
| 295 |
"document_type": document_type,
|
| 296 |
"entityrefkey": entity_ref_key,
|
| 297 |
-
"
|
| 298 |
-
"base64DataResp": base64DataResp, # Only for small files
|
| 299 |
-
"extracted_text": extracted_text,
|
| 300 |
-
"summary": summary, # Only for small files
|
| 301 |
}
|
| 302 |
|
| 303 |
inserted_doc = invoice_collection.insert_one(document)
|
|
@@ -306,9 +297,7 @@ def extract_text_from_file(
|
|
| 306 |
return {
|
| 307 |
"message": "Document successfully stored in MongoDB",
|
| 308 |
"document_id": document_id,
|
| 309 |
-
"
|
| 310 |
-
"num_pages": num_pages,
|
| 311 |
-
"summary": summary if summary else "Skipped for large documents"
|
| 312 |
}
|
| 313 |
|
| 314 |
except Exception as e:
|
|
@@ -318,6 +307,7 @@ def extract_text_from_file(
|
|
| 318 |
"traceback": traceback.format_exc()
|
| 319 |
}
|
| 320 |
return {"error": error_details}
|
|
|
|
| 321 |
|
| 322 |
# Serve the output folder as static files
|
| 323 |
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
|
|
|
|
| 230 |
logger.error(f"Error in data extraction: {e}")
|
| 231 |
return {"error": str(e)}
|
| 232 |
|
| 233 |
+
# def extract_text_from_s3(file_key, content_type):
|
| 234 |
+
# return "Extracted text from file", 1 # Placeholder for real extraction logic
|
| 235 |
|
| 236 |
+
# def convert_to_base64(file_key):
|
| 237 |
+
# return "Base64 encoded data" # Placeholder
|
| 238 |
|
| 239 |
+
# def generate_summary(extracted_text):
|
| 240 |
+
# return "Summarized text" # Placeholder
|
| 241 |
|
| 242 |
def get_content_type_from_s3(file_key):
|
| 243 |
"""Fetch the content type (MIME type) of a file stored in S3."""
|
|
|
|
| 275 |
"document": existing_document
|
| 276 |
}
|
| 277 |
|
| 278 |
+
# Retrieve file from S3 and determine content type
|
| 279 |
+
content_type = get_content_type_from_s3(file_key)
|
| 280 |
|
| 281 |
+
# Extract and parse invoice data
|
| 282 |
+
file_data, _ = fetch_file_from_s3(file_key)
|
| 283 |
+
extracted_data = extract_invoice_data(file_data, content_type)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
# Store extracted data in MongoDB
|
| 286 |
document = {
|
|
|
|
| 288 |
"file_type": content_type,
|
| 289 |
"document_type": document_type,
|
| 290 |
"entityrefkey": entity_ref_key,
|
| 291 |
+
"extracted_data": extracted_data
|
|
|
|
|
|
|
|
|
|
| 292 |
}
|
| 293 |
|
| 294 |
inserted_doc = invoice_collection.insert_one(document)
|
|
|
|
| 297 |
return {
|
| 298 |
"message": "Document successfully stored in MongoDB",
|
| 299 |
"document_id": document_id,
|
| 300 |
+
"extracted_data": extracted_data
|
|
|
|
|
|
|
| 301 |
}
|
| 302 |
|
| 303 |
except Exception as e:
|
|
|
|
| 307 |
"traceback": traceback.format_exc()
|
| 308 |
}
|
| 309 |
return {"error": error_details}
|
| 310 |
+
|
| 311 |
|
| 312 |
# Serve the output folder as static files
|
| 313 |
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
|