Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -230,6 +230,15 @@ def extract_invoice_data(file_data, content_type):
|
|
| 230 |
logger.error(f"Error in data extraction: {e}")
|
| 231 |
return {"error": str(e)}
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
# Dependency to check API Key
|
| 234 |
def verify_api_key(api_key: str = Header(...)):
|
| 235 |
if api_key != API_KEY:
|
|
@@ -257,18 +266,12 @@ def extract_text_from_file(
|
|
| 257 |
"document": existing_document
|
| 258 |
}
|
| 259 |
|
| 260 |
-
# Retrieve file from S3 and determine content type
|
| 261 |
-
content_type = get_content_type_from_s3(file_key)
|
| 262 |
-
|
| 263 |
-
# Extract text (Ensure Extraction function is implemented)
|
| 264 |
-
extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
|
| 265 |
|
| 266 |
-
#
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
if num_pages <= 2:
|
| 270 |
-
base64DataResp = convert_to_base64(file_key) # Implement this function
|
| 271 |
-
summary = generate_summary(extracted_text) # Implement this function
|
| 272 |
|
| 273 |
# Store extracted data in MongoDB
|
| 274 |
document = {
|
|
@@ -276,10 +279,7 @@ def extract_text_from_file(
|
|
| 276 |
"file_type": content_type,
|
| 277 |
"document_type": document_type,
|
| 278 |
"entityrefkey": entity_ref_key,
|
| 279 |
-
"
|
| 280 |
-
"base64DataResp": base64DataResp, # Only for small files
|
| 281 |
-
"extracted_text": extracted_text,
|
| 282 |
-
"summary": summary, # Only for small files
|
| 283 |
}
|
| 284 |
|
| 285 |
inserted_doc = invoice_collection.insert_one(document)
|
|
@@ -288,9 +288,8 @@ def extract_text_from_file(
|
|
| 288 |
return {
|
| 289 |
"message": "Document successfully stored in MongoDB",
|
| 290 |
"document_id": document_id,
|
| 291 |
-
"
|
| 292 |
-
"
|
| 293 |
-
"summary": summary if summary else "Skipped for large documents"
|
| 294 |
}
|
| 295 |
|
| 296 |
except Exception as e:
|
|
|
|
| 230 |
logger.error(f"Error in data extraction: {e}")
|
| 231 |
return {"error": str(e)}
|
| 232 |
|
| 233 |
+
def get_content_type_from_s3(file_key):
    """Fetch the content type (MIME type) of a file stored in S3.

    Sends a ``head_object`` request for ``file_key`` in ``S3_BUCKET_NAME``
    and reads the ``ContentType`` entry of the response.

    Args:
        file_key: Key of the object to inspect.

    Returns:
        The ``ContentType`` value of the response, or
        ``'application/octet-stream'`` when the response carries none.

    Raises:
        Exception: If the lookup fails. The message is prefixed with
            "Failed to get content type from S3" and the underlying error
            is chained as the cause.
    """
    # NOTE: the ``def`` line above was previously commented out, which left
    # this body orphaned and the name undefined for its callers.
    try:
        response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        # Default to binary if not found
        return response.get('ContentType', 'application/octet-stream')
    except Exception as e:
        # Chain the original error so the S3 traceback is not lost.
        raise Exception(f"Failed to get content type from S3: {str(e)}") from e
|
| 240 |
+
|
| 241 |
+
|
| 242 |
# Dependency to check API Key
|
| 243 |
def verify_api_key(api_key: str = Header(...)):
|
| 244 |
if api_key != API_KEY:
|
|
|
|
| 266 |
"document": existing_document
|
| 267 |
}
|
| 268 |
|
| 269 |
+
# Retrieve file from S3 and determine content type
|
| 270 |
+
content_type = get_content_type_from_s3(file_key)
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
+
# Extract and parse invoice data
|
| 273 |
+
file_data, _ = fetch_file_from_s3(file_key)
|
| 274 |
+
extracted_data = extract_invoice_data(file_data, content_type)
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
# Store extracted data in MongoDB
|
| 277 |
document = {
|
|
|
|
| 279 |
"file_type": content_type,
|
| 280 |
"document_type": document_type,
|
| 281 |
"entityrefkey": entity_ref_key,
|
| 282 |
+
"extracted_data": extracted_data
|
|
|
|
|
|
|
|
|
|
| 283 |
}
|
| 284 |
|
| 285 |
inserted_doc = invoice_collection.insert_one(document)
|
|
|
|
| 288 |
return {
|
| 289 |
"message": "Document successfully stored in MongoDB",
|
| 290 |
"document_id": document_id,
|
| 291 |
+
"entityrefkey":entity_ref_key,
|
| 292 |
+
"extracted_data": extracted_data
|
|
|
|
| 293 |
}
|
| 294 |
|
| 295 |
except Exception as e:
|