document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

827e9a8

verified ·

1 Parent(s): 48bd53f

Changes updated

Browse files

Files changed (1) hide show

app.py +9 -15

app.py CHANGED Viewed

@@ -30,18 +30,17 @@ logger = logging.getLogger(__name__)
 MONGODB_URI = os.getenv("MONGODB_URI")
 DATABASE_NAME = os.getenv("DATABASE_NAME")
 COLLECTION_NAME = os.getenv("COLLECTION_NAME")
-# use_gpu = False
-# output_dir = 'output'
 # Check if environment variables are set
 if not MONGODB_URI:
-    raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
 # Initialize MongoDB Connection
 db_client = MongoClient(MONGODB_URI)
 db = db_client[DATABASE_NAME]
 invoice_collection = db[COLLECTION_NAME]
 app = FastAPI(docs_url='/')
 use_gpu = False
@@ -82,7 +81,7 @@ def fetch_file_from_s3(file_key):
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
 # Function to summarize text using OpenAI GPT
-def extract_invoice_data(file_data, content_type):
     system_prompt = "You are an expert in document data extraction."
     # Convert file to Base64
@@ -123,7 +122,7 @@ def extract_invoice_data(file_data, content_type):
         # Clean and parse JSON output
         content = response.choices[0].message.content.strip()
-        #cleaned_content = content.strip().strip('```json').strip('```')
         try:
             parsed_content = json.loads(cleaned_content)
@@ -136,7 +135,7 @@ def extract_invoice_data(file_data, content_type):
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
-#def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
     try:
         response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
@@ -144,7 +143,6 @@ def extract_invoice_data(file_data, content_type):
     except Exception as e:
         raise Exception(f"Failed to get content type from S3: {str(e)}")
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     if api_key != API_KEY:
@@ -164,7 +162,6 @@ def extract_text_from_file(
     """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
             existing_document["_id"] = str(existing_document["_id"])
             return {
@@ -172,7 +169,7 @@ def extract_text_from_file(
                 "document": existing_document
             }
         # Fetch dynamic schema based on document type
-        schema_doc = invoice_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
@@ -182,12 +179,10 @@ def extract_text_from_file(
         # Retrieve file from S3 and determine content type
         content_type = get_content_type_from_s3(file_key)
-        # Extract and parse invoice data
         file_data, _ = fetch_file_from_s3(file_key)
         extracted_data = extract_invoice_data(file_data, content_type, json_schema)
-        # Store extracted data in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
@@ -196,7 +191,6 @@ def extract_text_from_file(
             "extracted_data": extracted_data
         }
-        # Insert document into MongoDB
         try:
             inserted_doc = invoice_collection.insert_one(document)
             document_id = str(inserted_doc.inserted_id)
@@ -208,7 +202,7 @@ def extract_text_from_file(
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
-            "entityrefkey":entity_ref_key,
             "extracted_data": extracted_data
         }

 MONGODB_URI = os.getenv("MONGODB_URI")
 DATABASE_NAME = os.getenv("DATABASE_NAME")
 COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+SCHEMA = os.getenv("SCHEMA")
 # Check if environment variables are set
 if not MONGODB_URI:
+    raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
 # Initialize MongoDB Connection
 db_client = MongoClient(MONGODB_URI)
 db = db_client[DATABASE_NAME]
 invoice_collection = db[COLLECTION_NAME]
+schema_collection = db[SCHEMA]
 app = FastAPI(docs_url='/')
 use_gpu = False
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
 # Function to summarize text using OpenAI GPT
+def extract_invoice_data(file_data, content_type, json_schema):
     system_prompt = "You are an expert in document data extraction."
     # Convert file to Base64
         # Clean and parse JSON output
         content = response.choices[0].message.content.strip()
+        cleaned_content = content.strip().strip('```json').strip('```')
         try:
             parsed_content = json.loads(cleaned_content)
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
+def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
     try:
         response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
     except Exception as e:
         raise Exception(f"Failed to get content type from S3: {str(e)}")
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     if api_key != API_KEY:
     """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
             existing_document["_id"] = str(existing_document["_id"])
             return {
                 "document": existing_document
             }
         # Fetch dynamic schema based on document type
+        schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
         # Retrieve file from S3 and determine content type
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
         extracted_data = extract_invoice_data(file_data, content_type, json_schema)
+        # Build document for insertion
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "extracted_data": extracted_data
         }
         try:
             inserted_doc = invoice_collection.insert_one(document)
             document_id = str(inserted_doc.inserted_id)
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
+            "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }