document-extraction

Sleeping

vkumartr committed on Feb 7, 2025

Commit

78199be

verified ·

1 Parent(s): a5871da

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -99,7 +99,7 @@ def extract_invoice_data(file_data, content_type, json_schema):
     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
-    system_prompt = "You are an expert in document data extraction."
     base64_images = []
     if content_type == "application/pdf":
@@ -107,13 +107,13 @@ def extract_invoice_data(file_data, content_type, json_schema):
             images = convert_from_bytes(file_data)  # Convert PDF to images
             if len(images) > 2:
-                raise ValueError("PDF contains more than 2 pages.")
-            for img in images[:2]:
                 img_byte_arr = io.BytesIO()
-                img.save(img_byte_arr, format="PNG", dpi=(300, 300))
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
-                base64_images.append(f"data:image/png;base64,{base64_encoded}")
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")

     Extracts data from a PDF (converted to images) or an image.
     Only PDFs with 1 or 2 pages are allowed.
     """
+    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
     base64_images = []
     if content_type == "application/pdf":
             images = convert_from_bytes(file_data)  # Convert PDF to images
             if len(images) > 2:
+                raise ValueError("PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported.")
+            for img in images[:2]:  # Convert up to 2 pages
                 img_byte_arr = io.BytesIO()
+                img.save(img_byte_arr, format="PNG")
                 base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
+                base64_images.append(f"data:{content_type};base64,{base64_encoded}")
         except Exception as e:
             logger.error(f"Error converting PDF to image: {e}")