document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

09424c9

verified ·

1 Parent(s): 35fd719

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -29

app.py CHANGED Viewed

@@ -95,40 +95,24 @@ def extract_pdf_text(file_data):
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
-    Extracts data from a PDF or image and returns structured JSON based on the provided schema.
     """
-    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
     # Convert file to Base64
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
     base64dataresp = f"data:{content_type};base64,{base64_encoded}"
-    # Handle PDF Extraction & Format to JSON Schema
     if content_type == "application/pdf":
         extracted_text = extract_pdf_text(file_data)
         if not extracted_text:
             return {"error": "Failed to extract text from PDF"}, base64dataresp
-        try:
-            # Send extracted text to OpenAI for structured JSON conversion
-            response = openai.ChatCompletion.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": extracted_text}
-                ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
-                temperature=0.5,
-                max_tokens=16384
-            )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
-        except Exception as e:
-            logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
-            return {"error": str(e)}, base64dataresp
-    # Handle Image Extraction using OpenAI Vision API
     elif content_type.startswith("image/"):
         try:
             response = openai.ChatCompletion.create(
@@ -147,13 +131,25 @@ def extract_invoice_data(file_data, content_type, json_schema):
                         ]
                     }
                 ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
                 temperature=0.5,
                 max_tokens=16384
             )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
         except Exception as e:
             logger.error(f"Error in OpenAI image processing: {e}")
             return {"error": str(e)}, base64dataresp
@@ -185,7 +181,7 @@ def extract_text_from_file(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """Extract structured data from a PDF or Image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
@@ -195,7 +191,7 @@ def extract_text_from_file(
                 "document": existing_document
             }
-        # Fetch JSON schema for the document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
@@ -208,7 +204,7 @@ def extract_text_from_file(
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
-        # Extract structured data from the document
         extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
         # Build and store document in MongoDB
@@ -233,7 +229,6 @@ def extract_text_from_file(
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64dataResp": base64dataresp,
             "extracted_data": extracted_data
         }

 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
+    Handles both PDF text extraction (PyMuPDF) and Image OCR using OpenAI GPT.
+    Returns extracted data along with the base64 representation.
     """
+    system_prompt = "You are an expert in document data extraction."
     # Convert file to Base64
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
     base64dataresp = f"data:{content_type};base64,{base64_encoded}"
+    # Handle PDF separately
     if content_type == "application/pdf":
         extracted_text = extract_pdf_text(file_data)
         if not extracted_text:
             return {"error": "Failed to extract text from PDF"}, base64dataresp
+        return {"extracted_text": extracted_text}, base64dataresp  # Return plain text for PDFs
+    # Handle Images using OpenAI
     elif content_type.startswith("image/"):
         try:
             response = openai.ChatCompletion.create(
                         ]
                     }
                 ],
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": json_schema
+                },
                 temperature=0.5,
                 max_tokens=16384
             )
+            # Clean and parse JSON output
+            content = response.choices[0].message.content.strip()
+            cleaned_content = content.strip().strip('```json').strip('```')
+            try:
+                parsed_content = json.loads(cleaned_content)
+                return parsed_content, base64dataresp  # Return extracted structured data
+            except json.JSONDecodeError as e:
+                logger.error(f"JSON Parse Error: {e}")
+                return None, base64dataresp
         except Exception as e:
             logger.error(f"Error in OpenAI image processing: {e}")
             return {"error": str(e)}, base64dataresp
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract text from a PDF or Image stored in S3 and process it accordingly."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
                 "document": existing_document
             }
+        # Fetch schema for the document type
         schema_doc = schema_collection.find_one({"document_type": document_type})
         if not schema_doc:
             raise ValueError("No schema found for the given document type")
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
+        # Extract data from the document (PDF or Image)
         extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
         # Build and store document in MongoDB
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }