vkumartr committed on
Commit
851fdc1
·
verified ·
1 Parent(s): b6edac7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -24
app.py CHANGED
@@ -81,17 +81,42 @@ def fetch_file_from_s3(file_key):
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
 
84
  def extract_invoice_data(file_data, content_type, json_schema):
 
 
 
 
85
  system_prompt = "You are an expert in document data extraction."
86
 
87
- # Convert file to Base64
88
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # Determine the correct MIME type for OpenAI
91
- if content_type.startswith("image/"):
92
- mime_type = content_type # e.g., image/png, image/jpeg
93
- elif content_type == "application/pdf":
94
- mime_type = "application/pdf"
 
 
 
 
 
95
  else:
96
  raise ValueError(f"Unsupported content type: {content_type}")
97
 
@@ -100,30 +125,16 @@ def extract_invoice_data(file_data, content_type, json_schema):
100
  model="gpt-4o-mini",
101
  messages=[
102
  {"role": "system", "content": system_prompt},
103
- {
104
- "role": "user",
105
- "content": [
106
- {
107
- "type": "image_url",
108
- "image_url": {
109
- "url": f"data:{mime_type};base64,{base64_encoded}"
110
- }
111
- }
112
- ]
113
- }
114
  ],
115
- response_format={
116
- "type": "json_schema",
117
- "json_schema": json_schema
118
- },
119
  temperature=0.5,
120
  max_tokens=16384
121
  )
122
 
123
- # Clean and parse JSON output
124
  content = response.choices[0].message.content.strip()
125
- cleaned_content = content.strip().strip('```json').strip('```')
126
 
 
 
127
  try:
128
  parsed_content = json.loads(cleaned_content)
129
  return parsed_content
 
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
84
+ # Updated extraction function that handles PDF and image files differently
85
  def extract_invoice_data(file_data, content_type, json_schema):
86
+ """
87
+ For PDFs: Extract the embedded text using PyMuPDF (no OCR involved)
88
+ For Images: Pass the Base64-encoded image to OpenAI (assuming a multimodal model)
89
+ """
90
  system_prompt = "You are an expert in document data extraction."
91
 
92
+ if content_type == "application/pdf":
93
+ # Use PyMuPDF to extract text directly from the PDF
94
+ try:
95
+ doc = fitz.open(stream=file_data, filetype="pdf")
96
+ extracted_text = ""
97
+ for page in doc:
98
+ extracted_text += page.get_text()
99
+ except Exception as e:
100
+ logger.error(f"Error extracting text from PDF: {e}")
101
+ raise
102
+
103
+ # Build a prompt containing the extracted text and the schema
104
+ prompt = (
105
+ f"Extract the invoice data from the following PDF text. "
106
+ f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
107
+ f"PDF Text:\n{extracted_text}"
108
+ )
109
 
110
+ elif content_type.startswith("image/"):
111
+ # For images, encode as Base64 and pass to OpenAI
112
+ base64_encoded = base64.b64encode(file_data).decode('utf-8')
113
+ # In this example we assume the model accepts image inputs via a Base64 data URL.
114
+ # (This requires access to a multimodal model.)
115
+ prompt = (
116
+ f"Extract the invoice data from the following image. "
117
+ f"Return only valid JSON that adheres to this schema:\n\n{json.dumps(json_schema, indent=2)}\n\n"
118
+ f"Image Data URL:\n data:{content_type};base64,{base64_encoded}"
119
+ )
120
  else:
121
  raise ValueError(f"Unsupported content type: {content_type}")
122
 
 
125
  model="gpt-4o-mini",
126
  messages=[
127
  {"role": "system", "content": system_prompt},
128
+ {"role": "user", "content": prompt},
 
 
 
 
 
 
 
 
 
 
129
  ],
 
 
 
 
130
  temperature=0.5,
131
  max_tokens=16384
132
  )
133
 
 
134
  content = response.choices[0].message.content.strip()
 
135
 
136
+ # Clean and parse JSON output (remove markdown formatting if present)
137
+ cleaned_content = content.strip().strip('```json').strip('```')
138
  try:
139
  parsed_content = json.loads(cleaned_content)
140
  return parsed_content