document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 6, 2025

Commit

d11faed

verified ·

1 Parent(s): cb92879

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -43

app.py CHANGED Viewed

@@ -80,6 +80,18 @@ def fetch_file_from_s3(file_key):
     except Exception as e:
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
     system_prompt = "You are an expert in document data extraction."
@@ -88,53 +100,57 @@ def extract_invoice_data(file_data, content_type, json_schema):
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
     base64dataresp = f"data:{content_type};base64,{base64_encoded}"
-    # Determine the correct MIME type for OpenAI
-    if content_type.startswith("image/"):
-        mime_type = content_type  # e.g., image/png, image/jpeg
-    elif content_type == "application/pdf":
-        mime_type = "application/pdf"
-    else:
-        raise ValueError(f"Unsupported content type: {content_type}")
-    try:
-        response = openai.ChatCompletion.create(
-            model="gpt-4o-mini",
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:{mime_type};base64,{base64_encoded}"
-                            }
-                        }
-                    ]
-                }
-            ],
-            response_format={
-                "type": "json_schema",
-                "json_schema": json_schema
-            },
-            temperature=0.5,
-            max_tokens=16384
-        )
-        # Clean and parse JSON output
-        content = response.choices[0].message.content.strip()
-        cleaned_content = content.strip().strip('```json').strip('```')
         try:
-            parsed_content = json.loads(cleaned_content)
-            return parsed_content,base64dataresp
-        except json.JSONDecodeError as e:
-            logger.error(f"JSON Parse Error: {e}")
-            return None,base64dataresp
-    except Exception as e:
-        logger.error(f"Error in data extraction: {e}")
-        return {"error": str(e)},base64dataresp
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""

     except Exception as e:
         raise Exception(f"Failed to fetch file from S3: {str(e)}")
+def extract_pdf_text(file_data):
+    """
+    Extract the plain text of a PDF using PyMuPDF (fitz).
+
+    Args:
+        file_data: Raw bytes of the PDF, handed to ``fitz.open`` as an
+            in-memory stream.
+
+    Returns:
+        The text of every page joined with newlines, or ``None`` if the
+        document could not be opened or read (the error is logged).
+    """
+    try:
+        # Open as a context manager so the document is closed on every path,
+        # including when a page raises while its text is being read.
+        with fitz.open(stream=file_data, filetype="pdf") as pdf_document:
+            return "\n".join(page.get_text("text") for page in pdf_document)
+    except Exception as e:
+        logger.error(f"PDF Extraction Error: {e}")
+        return None
 # Function to summarize text using OpenAI GPT
 def extract_invoice_data(file_data, content_type, json_schema):
+    """
+    Extract data from a PDF or image document.
+
+    PDFs are read locally with ``extract_pdf_text``; images are sent to the
+    OpenAI chat-completions API as a base64 data URL, with ``json_schema``
+    as the requested response format.
+
+    Args:
+        file_data: Raw bytes of the document.
+        content_type: MIME type of the document; ``application/pdf`` and
+            ``image/*`` are handled.
+        json_schema: Value passed as ``response_format["json_schema"]``.
+
+    Returns:
+        A ``(result, data_url)`` tuple. ``data_url`` is always the base64
+        data URL built from ``file_data``. ``result`` is
+        ``{"extracted_text": ...}`` for a PDF, the parsed JSON for an image,
+        ``{"error": ...}`` when PDF extraction or the OpenAI call fails, or
+        ``None`` when the model output is not valid JSON.
+
+    Raises:
+        ValueError: If ``content_type`` is neither a PDF nor an image.
+    """
     system_prompt = "You are an expert in document data extraction."
     base64_encoded = base64.b64encode(file_data).decode('utf-8')
     base64dataresp = f"data:{content_type};base64,{base64_encoded}"
+    if content_type == "application/pdf":
+        extracted_text = extract_pdf_text(file_data)
+        if not extracted_text:
+            return {"error": "Failed to extract text from PDF"}, base64dataresp
+        return {"extracted_text": extracted_text}, base64dataresp  # Return plain text for PDFs
+    # Handle Images using OpenAI
+    elif content_type.startswith("image/"):
         try:
+            response = openai.ChatCompletion.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    # Same data URL that is returned to the caller.
+                                    "url": base64dataresp
+                                }
+                            }
+                        ]
+                    }
+                ],
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": json_schema
+                },
+                temperature=0.5,
+                max_tokens=16384
+            )
+            # Clean and parse JSON output
+            content = response.choices[0].message.content.strip()
+            # Remove an optional Markdown code fence explicitly.
+            # str.strip("```json") treats its argument as a set of characters,
+            # so it could also eat leading/trailing j, s, o, n characters of
+            # the payload itself (e.g. "null" became "ull").
+            cleaned_content = content
+            if cleaned_content.startswith("```"):
+                cleaned_content = cleaned_content[3:]
+                if cleaned_content.startswith("json"):
+                    cleaned_content = cleaned_content[4:]
+            if cleaned_content.endswith("```"):
+                cleaned_content = cleaned_content[:-3]
+            cleaned_content = cleaned_content.strip()
+            try:
+                parsed_content = json.loads(cleaned_content)
+                return parsed_content, base64dataresp  # Return extracted structured data
+            except json.JSONDecodeError as e:
+                logger.error(f"JSON Parse Error: {e}")
+                return None, base64dataresp
+        except Exception as e:
+            logger.error(f"Error in OpenAI image processing: {e}")
+            return {"error": str(e)}, base64dataresp
+    else:
+        raise ValueError(f"Unsupported content type: {content_type}")
 def get_content_type_from_s3(file_key):
     """Fetch the content type (MIME type) of a file stored in S3."""