document-extraction

Sleeping

App Files Community

vkumartr committed on Feb 6, 2025

Commit

b27007b

verified ·

1 Parent(s): d11faed

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -8

app.py CHANGED Viewed

@@ -176,7 +176,7 @@ def extract_text_from_file(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
@@ -185,7 +185,8 @@ def extract_text_from_file(
                 "message": "Document Retrieved from MongoDB.",
                 "document": existing_document
             }
-        # Fetch dynamic schema based on document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
@@ -193,18 +194,20 @@ def extract_text_from_file(
         json_schema = schema_doc.get("json_schema")
         if not json_schema:
             raise ValueError("Schema is empty or not properly defined.")
-        # Retrieve file from S3 and determine content type
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
-        extracted_data,base64dataresp  = extract_invoice_data(file_data, content_type, json_schema)
-        # Build document for insertion
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
-            "base64dataResp":base64dataresp,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
@@ -221,7 +224,7 @@ def extract_text_from_file(
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64dataResp":base64dataresp,
             "extracted_data": extracted_data
         }

     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract text from a PDF or Image stored in S3 and process it accordingly."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
                 "message": "Document Retrieved from MongoDB.",
                 "document": existing_document
             }
+        # Fetch schema for the document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
         json_schema = schema_doc.get("json_schema")
         if not json_schema:
             raise ValueError("Schema is empty or not properly defined.")
+        # Retrieve file from S3
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
+        # Extract data from the document (PDF or Image)
+        extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
+        # Build and store document in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
+            "base64dataResp": base64dataresp,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
+            "base64dataResp": base64dataresp,
             "extracted_data": extracted_data
         }