document-extraction

Sleeping

App Files Community

vkumartr committed on Feb 6, 2025

Commit

f66ab35

verified ·

1 Parent(s): 8b0fe14

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -20

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import uvicorn
 from fastapi.staticfiles import StaticFiles
 import hashlib
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
 from PIL import Image
 import io
@@ -30,6 +31,9 @@ MONGODB_URI = os.getenv("MONGODB_URI")
 DATABASE_NAME = os.getenv("DATABASE_NAME")
 COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
 # Check if environment variables are set
 if not MONGODB_URI:
     raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
@@ -103,7 +107,7 @@ def extract_invoice_data(file_data, content_type):
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/{mime_type};base64,{base64_encoded}"
                             }
                         }
                     ]
@@ -213,7 +217,7 @@ def extract_invoice_data(file_data, content_type):
         # Clean and parse JSON output
         content = response.choices[0].message.content.strip()
-        return json.loads(content)
         try:
             parsed_content = json.loads(cleaned_content)
@@ -226,15 +230,6 @@ def extract_invoice_data(file_data, content_type):
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
-def get_content_type_from_s3(file_key):
-    """Fetch the content type (MIME type) of a file stored in S3."""
-    try:
-        response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
-        return response.get('ContentType', 'application/octet-stream')  # Default to binary if not found
-    except Exception as e:
-        raise Exception(f"Failed to get content type from S3: {str(e)}")
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     if api_key != API_KEY:
@@ -262,12 +257,18 @@ def extract_text_from_file(
                 "document": existing_document
             }
-        # Retrieve file from S3 and determine content type
-        content_type = get_content_type_from_s3(file_key)
-        # Extract and parse invoice data
-        file_data, _ = fetch_file_from_s3(file_key)
-        extracted_data = extract_invoice_data(file_data, content_type)
         # Store extracted data in MongoDB
         document = {
@@ -275,7 +276,10 @@ def extract_text_from_file(
             "file_type": content_type,
             "document_type": document_type,
             "entityrefkey": entity_ref_key,
-            "extracted_data": extracted_data
         }
         inserted_doc = invoice_collection.insert_one(document)
@@ -284,8 +288,9 @@ def extract_text_from_file(
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
-            "entityrefkey":entity_ref_key,
-            "extracted_data": extracted_data
         }
     except Exception as e:
@@ -295,7 +300,6 @@ def extract_text_from_file(
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

 import uvicorn
 from fastapi.staticfiles import StaticFiles
 import hashlib
+from enum import Enum
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
 from PIL import Image
 import io
 DATABASE_NAME = os.getenv("DATABASE_NAME")
 COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
+# use_gpu = False
+# output_dir = 'output'
 # Check if environment variables are set
 if not MONGODB_URI:
     raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
                         {
                             "type": "image_url",
                             "image_url": {
+                                "url": f"data:{mime_type};base64,{base64_encoded}"
                             }
                         }
                     ]
         # Clean and parse JSON output
         content = response.choices[0].message.content.strip()
+        #cleaned_content = content.strip().strip('```json').strip('```')
         try:
             parsed_content = json.loads(cleaned_content)
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     if api_key != API_KEY:
                 "document": existing_document
             }
+        # Retrieve file from S3 and determine content type (Ensure this step is implemented)
+        content_type = get_content_type_from_s3(file_key)  # Implement this function
+        # Extract text (Ensure Extraction function is implemented)
+        extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
+        # Define values for small/large files
+        base64DataResp = None
+        summary = None
+        if num_pages <= 2:
+            base64DataResp = convert_to_base64(file_key)  # Implement this function
+            summary = generate_summary(extracted_text)  # Implement this function
         # Store extracted data in MongoDB
         document = {
             "file_type": content_type,
             "document_type": document_type,
             "entityrefkey": entity_ref_key,
+            "num_pages": num_pages,
+            "base64DataResp": base64DataResp,  # Only for small files
+            "extracted_text": extracted_text,
+            "summary": summary,  # Only for small files
         }
         inserted_doc = invoice_collection.insert_one(document)
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
+            "file_key": file_key,
+            "num_pages": num_pages,
+            "summary": summary if summary else "Skipped for large documents"
         }
     except Exception as e:
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")