document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

dd8eaf2

verified ·

1 Parent(s): 57f0216

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -14

app.py CHANGED Viewed

@@ -84,7 +84,6 @@ def fetch_file_from_s3(file_key):
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
-# Function to summarize text using OpenAI GPT
 # Updated extraction function that handles PDF and image files differently
 def extract_invoice_data(file_data, content_type, json_schema):
     """
@@ -92,14 +91,24 @@ def extract_invoice_data(file_data, content_type, json_schema):
     For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
     """
     system_prompt = "You are an expert in document data extraction."
     if content_type == "application/pdf":
         # Use PyMuPDF to extract text directly from the PDF
         try:
             doc = fitz.open(stream=file_data, filetype="pdf")
             extracted_text = ""
             for page in doc:
                 extracted_text += page.get_text()
         except Exception as e:
             logger.error(f"Error extracting text from PDF: {e}")
             raise
@@ -112,18 +121,35 @@ def extract_invoice_data(file_data, content_type, json_schema):
         )
     elif content_type.startswith("image/"):
-        # For images, encode as Base64 and pass to OpenAI
-        base64_encoded = base64.b64encode(file_data).decode('utf-8')
-        # In this example we assume the model accepts image inputs via a Base64 data URL.
-        # (This requires access to a multimodal model.)
-        prompt = (
-            f"Extract the invoice data from the following image. "
-            f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
-            f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
-        )
     else:
         raise ValueError(f"Unsupported content type: {content_type}")
     try:
         response = openai.ChatCompletion.create(
             model="gpt-4o-mini",
@@ -136,15 +162,15 @@ def extract_invoice_data(file_data, content_type, json_schema):
         )
         content = response.choices[0].message.content.strip()
-        # Clean and parse JSON output (remove markdown formatting if present)
         cleaned_content = content.strip().strip('```json').strip('```')
         try:
             parsed_content = json.loads(cleaned_content)
-            return parsed_content
         except json.JSONDecodeError as e:
             logger.error(f"JSON Parse Error: {e}")
-            return None
     except Exception as e:
         logger.error(f"Error in data extraction: {e}")

         raise Exception(f"Failed to fetch file from S3: {str(e)}")
 # Updated extraction function that handles PDF and image files differently
 def extract_invoice_data(file_data, content_type, json_schema):
     """
     For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
     """
     system_prompt = "You are an expert in document data extraction."
+    base64_encoded_images = []  # To store Base64-encoded image data
+    extracted_data = {}
     if content_type == "application/pdf":
         # Use PyMuPDF to extract text directly from the PDF
         try:
             doc = fitz.open(stream=file_data, filetype="pdf")
+            num_pages = doc.page_count
+            # Check if the number of pages exceeds 2
+            if num_pages > 2:
+                raise ValueError("The PDF contains more than 2 pages, extraction not supported.")
             extracted_text = ""
             for page in doc:
                 extracted_text += page.get_text()
         except Exception as e:
             logger.error(f"Error extracting text from PDF: {e}")
             raise
         )
     elif content_type.startswith("image/"):
+        # For images, determine if more than 2 images are provided
+        try:
+            img = Image.open(io.BytesIO(file_data))  # Open the image file
+            num_images = img.n_frames  # Get number of images (pages in the image file)
+            if num_images > 2:
+                raise ValueError("The image file contains more than 2 pages, extraction not supported.")
+            # Process each image page if there are 1 or 2 pages
+            for page_num in range(num_images):
+                img.seek(page_num)  # Move to the current page
+                img_bytes = io.BytesIO()
+                img.save(img_bytes, format="PNG")  # Save each page as a PNG image in memory
+                base64_encoded = base64.b64encode(img_bytes.getvalue()).decode('utf-8')
+                base64_encoded_images.append(base64_encoded)
+            # Build a prompt containing the image data for OpenAI
+            prompt = f"Extract the invoice data from the following images (Base64 encoded). Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
+            for base64_image in base64_encoded_images:
+                prompt += f"Image Data URL: data:{content_type};base64,{base64_image}\n"
+        except Exception as e:
+            logger.error(f"Error handling images: {e}")
+            raise
     else:
         raise ValueError(f"Unsupported content type: {content_type}")
+    # Send request to OpenAI for data extraction
     try:
         response = openai.ChatCompletion.create(
             model="gpt-4o-mini",
         )
         content = response.choices[0].message.content.strip()
         cleaned_content = content.strip().strip('```json').strip('```')
         try:
             parsed_content = json.loads(cleaned_content)
+            extracted_data["extracted_json"] = parsed_content  # Store the parsed JSON data
+            return extracted_data
         except json.JSONDecodeError as e:
             logger.error(f"JSON Parse Error: {e}")
+            return {"error": f"JSON Parse Error: {str(e)}"}
     except Exception as e:
         logger.error(f"Error in data extraction: {e}")