document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

35fd719

verified ·

1 Parent(s): b27007b

Newly added

Browse files

Files changed (1) hide show

app.py +31 -22

app.py CHANGED Viewed

@@ -94,20 +94,41 @@ def extract_pdf_text(file_data):
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
-    system_prompt = "You are an expert in document data extraction."
     # Convert file to Base64
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
-    base64dataresp = f"data:{content_type};base64,{base64_encoded}"
     if content_type == "application/pdf":
         extracted_text = extract_pdf_text(file_data)
         if not extracted_text:
             return {"error": "Failed to extract text from PDF"}, base64dataresp
-        return {"extracted_text": extracted_text}, base64dataresp  # Return plain text for PDFs
-    # Handle Images using OpenAI
     elif content_type.startswith("image/"):
         try:
             response = openai.ChatCompletion.create(
@@ -126,25 +147,13 @@ def extract_invoice_data(file_data, content_type, json_schema):
                         ]
                     }
                 ],
-                response_format={
-                    "type": "json_schema",
-                    "json_schema": json_schema
-                },
                 temperature=0.5,
                 max_tokens=16384
             )
-            # Clean and parse JSON output
-            content = response.choices[0].message.content.strip()
-            cleaned_content = content.strip().strip('```json').strip('```')
-            try:
-                parsed_content = json.loads(cleaned_content)
-                return parsed_content, base64dataresp  # Return extracted structured data
-            except json.JSONDecodeError as e:
-                logger.error(f"JSON Parse Error: {e}")
-                return None, base64dataresp
         except Exception as e:
             logger.error(f"Error in OpenAI image processing: {e}")
             return {"error": str(e)}, base64dataresp
@@ -176,7 +185,7 @@ def extract_text_from_file(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """Extract text from a PDF or Image stored in S3 and process it accordingly."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
@@ -186,7 +195,7 @@ def extract_text_from_file(
                 "document": existing_document
             }
-        # Fetch schema for the document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
@@ -199,7 +208,7 @@ def extract_text_from_file(
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
-        # Extract data from the document (PDF or Image)
         extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
         # Build and store document in MongoDB

 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
+    """
+    Extracts data from a PDF or image and returns structured JSON based on the provided schema.
+    """
+    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
     # Convert file to Base64
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
+    base64dataresp = f"data:{content_type};base64,{base64_encoded}"
+    # Handle PDF Extraction & Format to JSON Schema
     if content_type == "application/pdf":
         extracted_text = extract_pdf_text(file_data)
         if not extracted_text:
             return {"error": "Failed to extract text from PDF"}, base64dataresp
+        try:
+            # Send extracted text to OpenAI for structured JSON conversion
+            response = openai.ChatCompletion.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": extracted_text}
+                ],
+                response_format={"type": "json_schema", "json_schema": json_schema},
+                temperature=0.5,
+                max_tokens=16384
+            )
+            parsed_content = json.loads(response.choices[0].message.content.strip())
+            return parsed_content, base64dataresp  # Return structured JSON
+        except Exception as e:
+            logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
+            return {"error": str(e)}, base64dataresp
+    # Handle Image Extraction using OpenAI Vision API
     elif content_type.startswith("image/"):
         try:
             response = openai.ChatCompletion.create(
                         ]
                     }
                 ],
+                response_format={"type": "json_schema", "json_schema": json_schema},
                 temperature=0.5,
                 max_tokens=16384
             )
+            parsed_content = json.loads(response.choices[0].message.content.strip())
+            return parsed_content, base64dataresp  # Return structured JSON
         except Exception as e:
             logger.error(f"Error in OpenAI image processing: {e}")
             return {"error": str(e)}, base64dataresp
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract structured data from a PDF or Image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
                 "document": existing_document
             }
+        # Fetch JSON schema for the document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
+        # Extract structured data from the document
         extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
         # Build and store document in MongoDB