document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 7, 2025

Commit

ae5ac49

verified ·

1 Parent(s): 49834bb

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -70

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import hashlib
 from enum import Enum
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
 from PIL import Image
 import io
 import fitz  # PyMuPDF for PDF handling
 import logging
@@ -95,71 +96,57 @@ def extract_pdf_text(file_data):
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
-    Extracts data from a PDF or image and returns structured JSON based on the provided schema.
     """
     system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
-    # Convert file to Base64
-    base64_encoded = base64.b64encode(file_data).decode('utf-8')
-    base64dataresp = f"data:{content_type};base64,{base64_encoded}"
-    # Handle PDF Extraction & Format to JSON Schema
     if content_type == "application/pdf":
-        extracted_text = extract_pdf_text(file_data)
-        if not extracted_text:
-            return {"error": "Failed to extract text from PDF"}, base64dataresp
         try:
-            # Send extracted text to OpenAI for structured JSON conversion
-            response = openai.ChatCompletion.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": extracted_text}
-                ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
-                temperature=0.5,
-                max_tokens=16384
-            )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
-        except Exception as e:
-            logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
-            return {"error": str(e)}, base64dataresp
-    # Handle Image Extraction using OpenAI Vision API
-    elif content_type.startswith("image/"):
-        try:
-            response = openai.ChatCompletion.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:{content_type};base64,{base64_encoded}"
-                                }
-                            }
-                        ]
-                    }
-                ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
-                temperature=0.5,
-                max_tokens=16384
-            )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
         except Exception as e:
-            logger.error(f"Error in OpenAI image processing: {e}")
-            return {"error": str(e)}, base64dataresp
     else:
-        raise ValueError(f"Unsupported content type: {content_type}")
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
@@ -185,7 +172,7 @@ def extract_text_from_file(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """Extract structured data from a PDF or Image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
@@ -209,38 +196,34 @@ def extract_text_from_file(
         file_data, _ = fetch_file_from_s3(file_key)
         # Extract structured data from the document
-        extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
-        # Build and store document in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
-            "base64dataResp": base64dataresp,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
-        try:
-            inserted_doc = invoice_collection.insert_one(document)
-            document_id = str(inserted_doc.inserted_id)
-            logger.info(f"Document inserted with ID: {document_id}")
-        except Exception as e:
-            logger.error(f"Error inserting document: {str(e)}")
-            raise HTTPException(status_code=500, detail="Error inserting document into MongoDB")
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64dataResp": base64dataresp,
             "extracted_data": extracted_data
         }
     except Exception as e:
         error_details = {
-            "error_type": type(e).__name__,
-            "error_message": str(e),
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}

 from enum import Enum
 from fastapi import FastAPI, Header, Query, Depends, HTTPException
 from PIL import Image
+from pdf2image import convert_from_bytes
 import io
 import fitz  # PyMuPDF for PDF handling
 import logging
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
+    Extracts data from a PDF (converted to images) or an image.
+    Only PDFs with 1 or 2 pages are allowed.
     """
     system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
+    base64_images = []
     if content_type == "application/pdf":
         try:
+            images = convert_from_bytes(file_data)  # Convert PDF to images
+            if len(images) > 2:
+                raise ValueError("PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported.")
+            for img in images[:2]:  # Convert up to 2 pages
+                img_byte_arr = io.BytesIO()
+                img.save(img_byte_arr, format="PNG")
+                base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
+                base64_images.append(f"data:image/png;base64,{base64_encoded}")
+            content_type = "image/png"
         except Exception as e:
+            logger.error(f"Error converting PDF to image: {e}")
+            return {"error": "Failed to process PDF"}, None
     else:
+        # Handle direct image files
+        base64_encoded = base64.b64encode(file_data).decode('utf-8')
+        base64_images.append(f"data:{content_type};base64,{base64_encoded}")
+    # Prepare OpenAI request
+    openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": openai_content}
+            ],
+            response_format={"type": "json_schema", "json_schema": json_schema},
+            temperature=0.5,
+            max_tokens=16384
+        )
+        parsed_content = json.loads(response.choices[0].message.content.strip())
+        return parsed_content, base64_images
+    except Exception as e:
+        logger.error(f"Error in OpenAI processing: {e}")
+        return {"error": str(e)}, base64_images
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract structured data from a PDF or image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
         file_data, _ = fetch_file_from_s3(file_key)
         # Extract structured data from the document
+        extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)
+        # Store document in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
+            "base64_images": base64_images,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
+        inserted_doc = invoice_collection.insert_one(document)
+        document_id = str(inserted_doc.inserted_id)
+        logger.info(f"Document inserted with ID: {document_id}")
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
+            "base64_images": base64_images,
             "extracted_data": extracted_data
         }
     except Exception as e:
         error_details = {
+            "error_type": type(e).__name__,
+            "error_message": str(e),
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}