document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

2125a91

verified ·

1 Parent(s): c4f2ca9

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -33

app.py CHANGED Viewed

@@ -46,6 +46,7 @@ app = FastAPI(docs_url='/')
 use_gpu = False
 output_dir = 'output'
 @app.on_event("startup")
 def startup_db():
     try:
@@ -54,6 +55,7 @@ def startup_db():
     except Exception as e:
         logger.error(f"MongoDB connection failed: {str(e)}")
 # AWS S3 Configuration
 API_KEY = os.getenv("API_KEY")
 AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
@@ -70,6 +72,7 @@ s3_client = boto3.client(
     aws_secret_access_key=AWS_SECRET_KEY
 )
 # Function to fetch file from S3
 def fetch_file_from_s3(file_key):
     try:
@@ -80,47 +83,84 @@ def fetch_file_from_s3(file_key):
     except Exception as e:
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
-# Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     system_prompt = "You are an expert in document data extraction."
-    # Convert file to Base64
-    base64_encoded = base64.b64encode(file_data).decode('utf-8')
-    # Determine the correct MIME type for OpenAI
-    if content_type.startswith("image/"):
-        mime_type = content_type  # e.g., image/png, image/jpeg
-    elif content_type == "application/pdf":
-        mime_type = "application/pdf"
     else:
         raise ValueError(f"Unsupported content type: {content_type}")
     try:
         response = openai.ChatCompletion.create(
             model="gpt-4o-mini",
             messages=[
                 {"role": "system", "content": system_prompt},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:{mime_type};base64,{base64_encoded}"
-                            }
-                        }
-                    ]
-                }
             ],
-            response_format={
-                "type": "json_schema",
-                "json_schema": json_schema
-            },
             temperature=0.5,
             max_tokens=16384
         )
-        # Clean and parse JSON output
         content = response.choices[0].message.content.strip()
         cleaned_content = content.strip().strip('```json').strip('```')
@@ -129,12 +169,13 @@ def extract_invoice_data(file_data, content_type, json_schema):
             return parsed_content
         except json.JSONDecodeError as e:
             logger.error(f"JSON Parse Error: {e}")
-            return None
     except Exception as e:
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
     try:
@@ -143,21 +184,24 @@ def get_content_type_from_s3(file_key):
     except Exception as e:
         raise Exception(f"Failed to get content type from S3: {str(e)}")
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     """Reject the request unless the supplied API key matches the configured one.

     Args:
         api_key: Value of the required request header bound to this parameter.

     Raises:
         HTTPException: 401 when the supplied key does not match ``API_KEY``,
             or when ``API_KEY`` is not configured.
     """
     # Local import keeps this block self-contained.
     import hmac

     # API_KEY comes from os.getenv and is None when unset; fail closed
     # explicitly rather than relying on a str-vs-None comparison.
     # compare_digest runs in constant time, so response timing does not
     # reveal how many leading characters of the key were correct.
     if API_KEY is None or not hmac.compare_digest(
         api_key.encode("utf-8"), API_KEY.encode("utf-8")
     ):
         raise HTTPException(status_code=401, detail="Invalid API Key")
 @app.get("/")
 def read_root():
     """Return the static welcome payload for the root path."""
     # NOTE(review): the app is created with docs_url='/', so this route may be
     # shadowed by the interactive docs page — confirm which one is served.
     welcome = {"message": "Welcome to the Invoice Summarization API!"}
     return welcome
 @app.get("/ocr/extraction")
 def extract_text_from_file(
-    api_key: str = Depends(verify_api_key),
-    file_key: str = Query(..., description="S3 file key for the file"),
-    document_type: str = Query(..., description="Type of document"),
-    entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
     """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
@@ -175,9 +219,9 @@ def extract_text_from_file(
         json_schema = schema_doc.get("json_schema")
         if not json_schema:
-            raise ValueError("Schema is empty or not properly defined.")
-        # Retrieve file from S3 and determine content type
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
         extracted_data = extract_invoice_data(file_data, content_type, json_schema)
@@ -213,7 +257,8 @@ def extract_text_from_file(
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 # NOTE(review): the directory name is hard-coded here although `output_dir`
 # is assigned the same value near the top of the module — presumably the two
 # should stay in sync; confirm.
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

 use_gpu = False
 output_dir = 'output'
 @app.on_event("startup")
 def startup_db():
     try:
     except Exception as e:
         logger.error(f"MongoDB connection failed: {str(e)}")
 # AWS S3 Configuration
 API_KEY = os.getenv("API_KEY")
 AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
     aws_secret_access_key=AWS_SECRET_KEY
 )
 # Function to fetch file from S3
 def fetch_file_from_s3(file_key):
     try:
     except Exception as e:
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
+# Updated extraction function that handles PDF and image files differently
 def extract_invoice_data(file_data, content_type, json_schema):
+    """
+    For PDFs: Extract the embedded text using PyMuPDF (no OCR involved)
+    For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
+    """
     system_prompt = "You are an expert in document data extraction."
+    base64_encoded_images = []  # To store Base64-encoded image data
+    extracted_data = {}
+    if content_type == "application/pdf":
+        # Use PyMuPDF to extract text directly from the PDF
+        try:
+            doc = fitz.open(stream=file_data, filetype="pdf")
+            num_pages = doc.page_count
+            # Check if the number of pages exceeds 2
+            if num_pages > 2:
+                raise ValueError("The PDF contains more than 2 pages, extraction not supported.")
+            extracted_text = ""
+            for page in doc:
+                extracted_text += page.get_text()
+        except Exception as e:
+            logger.error(f"Error extracting text from PDF: {e}")
+            raise
+        # Build a prompt containing the extracted text and the schema
+        prompt = (
+            f"Extract the invoice data from the following PDF text. "
+            f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
+            f"PDF Text:\n{extracted_text}"
+        )
+    elif content_type.startswith("image/"):
+        # For images, determine if more than 2 images are provided
+        try:
+            img = Image.open(io.BytesIO(file_data))  # Open the image file
+            num_images = img.n_frames  # Get number of images (pages in the image file)
+            if num_images > 2:
+                raise ValueError("The image file contains more than 2 pages, extraction not supported.")
+            # Process each image page if there are 1 or 2 pages
+            for page_num in range(num_images):
+                img.seek(page_num)  # Move to the current page
+                img_bytes = io.BytesIO()
+                img.save(img_bytes, format="PNG")  # Save each page as a PNG image in memory
+                base64_encoded = base64.b64encode(img_bytes.getvalue()).decode('utf-8')
+                base64_encoded_images.append(base64_encoded)
+            # Build a prompt containing the image data for OpenAI
+            prompt = f"Extract the invoice data from the following images (Base64 encoded). Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
+            for base64_image in base64_encoded_images:
+                prompt += f"Image Data URL: data:{content_type};base64,{base64_image}\n"
+        except Exception as e:
+            logger.error(f"Error handling images: {e}")
+            raise
     else:
         raise ValueError(f"Unsupported content type: {content_type}")
+    # Send request to OpenAI for data extraction
     try:
         response = openai.ChatCompletion.create(
             model="gpt-4o-mini",
             messages=[
                 {"role": "system", "content": system_prompt},
+                {"role": "user", "content": prompt},
             ],
             temperature=0.5,
             max_tokens=16384
         )
         content = response.choices[0].message.content.strip()
         cleaned_content = content.strip().strip('```json').strip('```')
             return parsed_content
         except json.JSONDecodeError as e:
             logger.error(f"JSON Parse Error: {e}")
+            return {"error": f"JSON Parse Error: {str(e)}"}
     except Exception as e:
         logger.error(f"Error in data extraction: {e}")
         return {"error": str(e)}
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""
     try:
     except Exception as e:
         raise Exception(f"Failed to get content type from S3: {str(e)}")
 # Dependency to check API Key
 def verify_api_key(api_key: str = Header(...)):
     """Reject the request unless the supplied API key matches the configured one.

     Args:
         api_key: Value of the required request header bound to this parameter.

     Raises:
         HTTPException: 401 when the supplied key does not match ``API_KEY``,
             or when ``API_KEY`` is not configured.
     """
     # Local import keeps this block self-contained.
     import hmac

     # API_KEY comes from os.getenv and is None when unset; fail closed
     # explicitly rather than relying on a str-vs-None comparison.
     # compare_digest runs in constant time, so response timing does not
     # reveal how many leading characters of the key were correct.
     if API_KEY is None or not hmac.compare_digest(
         api_key.encode("utf-8"), API_KEY.encode("utf-8")
     ):
         raise HTTPException(status_code=401, detail="Invalid API Key")
 @app.get("/")
 def read_root():
     """Return the static welcome payload for the root path."""
     # NOTE(review): the app is created with docs_url='/', so this route may be
     # shadowed by the interactive docs page — confirm which one is served.
     welcome = {"message": "Welcome to the Invoice Summarization API!"}
     return welcome
 @app.get("/ocr/extraction")
 def extract_text_from_file(
+        api_key: str = Depends(verify_api_key),
+        file_key: str = Query(..., description="S3 file key for the file"),
+        document_type: str = Query(..., description="Type of document"),
+        entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
     """Extract text from a PDF or Image stored in S3 and process it based on document size."""
     try:
         json_schema = schema_doc.get("json_schema")
         if not json_schema:
+            raise ValueError("Schema is empty or not properly defined.")
+            # Retrieve file from S3 and determine content type
         content_type = get_content_type_from_s3(file_key)
         file_data, _ = fetch_file_from_s3(file_key)
         extracted_data = extract_invoice_data(file_data, content_type, json_schema)
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 # NOTE(review): the directory name is hard-coded here although `output_dir`
 # is assigned the same value near the top of the module — presumably the two
 # should stay in sync; confirm.
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")