document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 7, 2025

Commit

4fe40a7

verified ·

1 Parent(s): 4f9f527

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -110

app.py CHANGED Viewed

@@ -1,23 +1,18 @@
 import uvicorn
 from fastapi.staticfiles import StaticFiles
-import hashlib
-from enum import Enum
-from fastapi import FastAPI, Header, Query, Depends, HTTPException
-from PIL import Image
 import io
-import fitz  # PyMuPDF for PDF handling
 import logging
-from pymongo import MongoClient
 import boto3
 import openai
 import os
-import traceback  # For detailed traceback of errors
-import re
 import json
-from dotenv import load_dotenv
 import base64
-from bson.objectid import ObjectId
 db_client = None
 load_dotenv()
@@ -32,19 +27,15 @@ DATABASE_NAME = os.getenv("DATABASE_NAME")
 COLLECTION_NAME = os.getenv("COLLECTION_NAME")
 SCHEMA = os.getenv("SCHEMA")
-# Check if environment variables are set
 if not MONGODB_URI:
     raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
-# Initialize MongoDB Connection
 db_client = MongoClient(MONGODB_URI)
 db = db_client[DATABASE_NAME]
 invoice_collection = db[COLLECTION_NAME]
 schema_collection = db[SCHEMA]
 app = FastAPI(docs_url='/')
-use_gpu = False
-output_dir = 'output'
 @app.on_event("startup")
 def startup_db():
@@ -63,114 +54,86 @@ S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 # OpenAI Configuration
 openai.api_key = os.getenv("OPENAI_API_KEY")
-# S3 Client
 s3_client = boto3.client(
     's3',
     aws_access_key_id=AWS_ACCESS_KEY,
     aws_secret_access_key=AWS_SECRET_KEY
 )
-# Function to fetch file from S3
 def fetch_file_from_s3(file_key):
     try:
         response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
-        content_type = response['ContentType']  # Retrieve MIME type
         file_data = response['Body'].read()
-        return file_data, content_type  # Return file data as BytesIO
     except Exception as e:
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
-def extract_pdf_text(file_data):
-    """
-    Extracts text from a PDF file using PyMuPDF (fitz).
-    """
-    try:
-        pdf_document = fitz.open(stream=file_data, filetype="pdf")
-        text = "\n".join([page.get_text("text") for page in pdf_document])
-        return text
-    except Exception as e:
-        logger.error(f"PDF Extraction Error: {e}")
-        return None
-# Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     """
-    Extracts data from a PDF or image and returns structured JSON based on the provided schema.
     """
     system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
-    # Convert file to Base64
-    base64_encoded = base64.b64encode(file_data).decode('utf-8')
-    base64dataresp = f"data:{content_type};base64,{base64_encoded}"
-    # Handle PDF Extraction & Format to JSON Schema
     if content_type == "application/pdf":
-        extracted_text = extract_pdf_text(file_data)
-        if not extracted_text:
-            return {"error": "Failed to extract text from PDF"}, base64dataresp
         try:
-            # Send extracted text to OpenAI for structured JSON conversion
-            response = openai.ChatCompletion.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": extracted_text}
-                ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
-                temperature=0.5,
-                max_tokens=16384
-            )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
-        except Exception as e:
-            logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
-            return {"error": str(e)}, base64dataresp
-    # Handle Image Extraction using OpenAI Vision API
-    elif content_type.startswith("image/"):
-        try:
-            response = openai.ChatCompletion.create(
-                model="gpt-4o-mini",
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:{content_type};base64,{base64_encoded}"
-                                }
-                            }
-                        ]
-                    }
-                ],
-                response_format={"type": "json_schema", "json_schema": json_schema},
-                temperature=0.5,
-                max_tokens=16384
-            )
-            parsed_content = json.loads(response.choices[0].message.content.strip())
-            return parsed_content, base64dataresp  # Return structured JSON
         except Exception as e:
-            logger.error(f"Error in OpenAI image processing: {e}")
-            return {"error": str(e)}, base64dataresp
     else:
-        raise ValueError(f"Unsupported content type: {content_type}")
 def get_content_type_from_s3(file_key):
-    """Fetch the content type (MIME type) of a file stored in S3."""
     try:
         response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
-        return response.get('ContentType', 'application/octet-stream')  # Default to binary if not found
     except Exception as e:
         raise Exception(f"Failed to get content type from S3: {str(e)}")
-# Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     if api_key != API_KEY:
         raise HTTPException(status_code=401, detail="Invalid API Key")
@@ -185,7 +148,7 @@ def extract_text_from_file(
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
-    """Extract structured data from a PDF or Image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
@@ -209,43 +172,34 @@ def extract_text_from_file(
         file_data, _ = fetch_file_from_s3(file_key)
         # Extract structured data from the document
-        extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
-        # Build and store document in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
-            "base64dataResp": base64dataresp,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
-        try:
-            inserted_doc = invoice_collection.insert_one(document)
-            document_id = str(inserted_doc.inserted_id)
-            logger.info(f"Document inserted with ID: {document_id}")
-        except Exception as e:
-            logger.error(f"Error inserting document: {str(e)}")
-            raise HTTPException(status_code=500, detail="Error inserting document into MongoDB")
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
-            "base64dataResp": base64dataresp,
             "extracted_data": extracted_data
         }
     except Exception as e:
-        error_details = {
-            "error_type": type(e).__name__,
-            "error_message": str(e),
-            "traceback": traceback.format_exc()
-        }
         return {"error": error_details}
-# Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 if __name__ == '__main__':

 import uvicorn
 from fastapi.staticfiles import StaticFiles
 import io
 import logging
+import fitz  # PyMuPDF for PDF handling
 import boto3
 import openai
 import os
+import traceback
 import json
 import base64
+from pdf2image import convert_from_bytes
+from fastapi import FastAPI, Header, Query, Depends, HTTPException
+from pymongo import MongoClient
+from dotenv import load_dotenv
 db_client = None
 load_dotenv()
 COLLECTION_NAME = os.getenv("COLLECTION_NAME")
 SCHEMA = os.getenv("SCHEMA")
 if not MONGODB_URI:
     raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
 db_client = MongoClient(MONGODB_URI)
 db = db_client[DATABASE_NAME]
 invoice_collection = db[COLLECTION_NAME]
 schema_collection = db[SCHEMA]
 app = FastAPI(docs_url='/')
 @app.on_event("startup")
 def startup_db():
 # OpenAI Configuration
 # The key is read from the OPENAI_API_KEY environment variable; os.getenv
 # yields None when the variable is unset.
 openai.api_key = os.getenv("OPENAI_API_KEY")
 # Module-level S3 client shared by the S3 helper functions below. It is built
 # with explicit credentials; AWS_ACCESS_KEY and AWS_SECRET_KEY are assigned
 # outside this excerpt (presumably from the environment -- confirm).
 s3_client = boto3.client(
     's3',
     aws_access_key_id=AWS_ACCESS_KEY,
     aws_secret_access_key=AWS_SECRET_KEY
 )
 def fetch_file_from_s3(file_key):
     """Download an object from the configured S3 bucket.

     Args:
         file_key: Key of the object inside ``S3_BUCKET_NAME``.

     Returns:
         A ``(file_data, content_type)`` tuple: the object's full body as
         bytes and its MIME type. When the response carries no
         ``ContentType`` the type falls back to ``application/octet-stream``,
         the same default ``get_content_type_from_s3`` uses.

     Raises:
         Exception: If the object cannot be fetched or read; the original
             error is attached as ``__cause__``.
     """
     try:
         response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
         # .get() instead of indexing: a missing header should not be reported
         # as a failed download.
         content_type = response.get('ContentType', 'application/octet-stream')
         file_data = response['Body'].read()
         return file_data, content_type
     except Exception as e:
         # Chain explicitly so the underlying error stays in the traceback.
         raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
 # Maximum number of PDF pages accepted for extraction. Keep the user-facing
 # message in extract_invoice_data in sync with this value.
 _MAX_PDF_PAGES = 2
 def _png_data_url(image):
     """Serialise an image object (anything exposing ``save(fp, format=...)``) to a base64 PNG data URL."""
     buffer = io.BytesIO()
     image.save(buffer, format="PNG")
     encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
     return f"data:image/png;base64,{encoded}"
 def extract_invoice_data(file_data, content_type, json_schema):
     """Extract structured data from a PDF or an image through an OpenAI chat completion.

     A PDF is rasterised with ``convert_from_bytes`` and every page is sent
     as a PNG data URL. Any other content type is sent as a single data URL
     built from the raw bytes. Only PDFs with 1 or 2 pages are accepted.

     Args:
         file_data: Raw bytes of the document.
         content_type: MIME type of the document.
         json_schema: Passed through as the ``json_schema`` entry of the
             request's ``response_format``.

     Returns:
         A ``(result, images)`` tuple. ``result`` is the JSON object parsed
         from the model's reply, or ``{"error": <message>}`` on failure.
         ``images`` is the list of data URLs that were sent, or ``None``
         when the PDF could not be prepared.
     """
     system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
     if content_type == "application/pdf":
         try:
             # Render at most one page past the limit: enough to detect an
             # oversized PDF without rasterising the whole document.
             pages = convert_from_bytes(file_data, last_page=_MAX_PDF_PAGES + 1)
             if not pages:
                 logger.error("PDF conversion produced no pages")
                 return {"error": "PDF contains no pages."}, None
             if len(pages) > _MAX_PDF_PAGES:
                 # Reported directly instead of being raised inside this try,
                 # where the broad handler below would replace the message
                 # with the generic one.
                 too_many = "PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported."
                 logger.error(too_many)
                 return {"error": too_many}, None
             base64_images = [_png_data_url(page) for page in pages]
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")
             return {"error": "Failed to process PDF"}, None
     else:
         # Handle direct image files
         base64_encoded = base64.b64encode(file_data).decode('utf-8')
         base64_images = [f"data:{content_type};base64,{base64_encoded}"]
     # Prepare OpenAI request: one image_url part per page/image.
     openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
     try:
         # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
         # interface -- confirm against the pinned openai version.
         response = openai.ChatCompletion.create(
             model="gpt-4o-mini",
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": openai_content}
             ],
             response_format={"type": "json_schema", "json_schema": json_schema},
             temperature=0.5,
             max_tokens=16384
         )
         parsed_content = json.loads(response.choices[0].message.content.strip())
         return parsed_content, base64_images
     except Exception as e:
         logger.error(f"Error in OpenAI processing: {e}")
         return {"error": str(e)}, base64_images
 def get_content_type_from_s3(file_key):
     """Look up the MIME type recorded for an object in the configured S3 bucket.

     Issues a HEAD request for ``file_key`` in ``S3_BUCKET_NAME`` and returns
     the response's ``ContentType``, or ``application/octet-stream`` when the
     response has none. Any failure is re-raised as a generic ``Exception``
     carrying the original message.
     """
     fallback_type = 'application/octet-stream'
     try:
         head = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
         mime_type = head.get('ContentType', fallback_type)
         return mime_type
     except Exception as err:
         raise Exception(f"Failed to get content type from S3: {str(err)}")
 def verify_api_key(api_key: str = Header(...)):
     """Reject the request unless the supplied API key header matches ``API_KEY``.

     Args:
         api_key: Value of the required API key request header.

     Raises:
         HTTPException: 401 when the header does not match ``API_KEY`` or
             when no ``API_KEY`` is configured.
     """
     # Local import so this block does not depend on the file's import section.
     import hmac
     expected = API_KEY  # assumed to be a str or None (defined outside this excerpt) -- confirm
     # compare_digest takes time independent of where the values differ, so the
     # key cannot be recovered by timing repeated guesses. Comparing bytes also
     # avoids its ASCII-only restriction on str arguments.
     if expected is None or not hmac.compare_digest(
         api_key.encode("utf-8"), str(expected).encode("utf-8")
     ):
         raise HTTPException(status_code=401, detail="Invalid API Key")
     document_type: str = Query(..., description="Type of document"),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
+    """Extract structured data from a PDF or image stored in S3."""
     try:
         existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
         if existing_document:
         file_data, _ = fetch_file_from_s3(file_key)
         # Extract structured data from the document
+        extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)
+        # Store document in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
+            "baseDataResp": base64_images,
             "entityrefkey": entity_ref_key,
             "extracted_data": extracted_data
         }
+        inserted_doc = invoice_collection.insert_one(document)
+        document_id = str(inserted_doc.inserted_id)
+        logger.info(f"Document inserted with ID: {document_id}")
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "entityrefkey": entity_ref_key,
+            "baseDataResp": base64_images,
             "extracted_data": extracted_data
         }
     except Exception as e:
+        error_details = {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc()}
         return {"error": error_details}
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 if __name__ == '__main__':