document-extraction

Sleeping

App Files Files Community

vkumartr committed on Feb 5, 2025

Commit

961cc29

verified ·

1 Parent(s): b415d4c

JSON Schema modified

Browse files

Files changed (1) hide show

app.py +207 -87

app.py CHANGED Viewed

@@ -118,8 +118,7 @@ You should extract this data and structure it into a table-like format in the fo
 #     "InvoiceCurrency": "",
 #     "BaseAmount": "",
 #     "TaxAmount": "",
-#     "TotalInvoiceAmt": "",
-#     "TypeofInvoice": "",
 #     "CustomerName": "",
 #     "CustomerAddress": "",
 #     "CustomerGSTNO": "",
@@ -137,69 +136,107 @@ You should extract this data and structure it into a table-like format in the fo
 # }
 {
-    "$schema": "http://json-schema.org/draft-07/schema#",
-    "type": "object",
-    "title": "Invoice Information Extractor",
-    "description": "Schema for extracting specific information from invoices",
-    "properties": {
-        "invoice_headers": {
-            "type": "object",
-            "properties": {
-                "VendorName": { "type": "string", "description": "The name of the vendor" },
-                "VendorAddress": { "type": "string", "description": "The address of the vendor" },
-                "VendorGSTNo": { "type": "string", "description": "The GST number of the vendor" },
-                "InvoiceNo": { "type": "string", "description": "The number of the invoice" },
-                "InvoiceDate": { "type": "string", "format": "date", "description": "The date of the invoice in dd-MMM-yyyy format" },
-                "InvoiceCurrency": { "type": "string", "description": "The currency used in the invoice (e.g., USD, INR, AUD)" },
-                "BaseAmount": { "type": "number", "description": "The base amount before tax" },
-                "TaxAmount": { "type": "number", "description": "The tax amount on the invoice" },
-                "TotalInvoiceAmt": { "type": "number", "description": "The total amount on the invoice" },
-                "TypeofInvoice": { "type": "string", "description": "Type of invoice (e.g., Tax Invoice, Proforma Invoice)" },
-                "CustomerName": { "type": "string", "description": "The name of the customer" },
-                "CustomerAddress": { "type": "string", "description": "The address of the customer" },
-                "CustomerGSTNO": { "type": "string", "description": "The GST number of the customer" },
-                "RefNo": { "type": "string", "description": "Reference number related to shipping or order" },
-                "ShippingOrder": { "type": "string", "description": "Shipping order details" }
-            },
-            "required": [
-                "VendorName",
-                "VendorAddress",
-                "VendorGSTNo",
-                "InvoiceNo",
-                "InvoiceDate",
-                "InvoiceCurrency",
-                "BaseAmount",
-                "TaxAmount",
-                "TotalInvoiceAmt",
-                "TypeofInvoice",
-                "CustomerName",
-                "CustomerAddress",
-                "CustomerGSTNO",
-                "RefNo",
-                "ShippingOrder"
-            ],
-            "additionalProperties": false
-        },
-        "line_items": {
             "type": "array",
-            "description": "List of line items on the invoice",
             "items": {
-                "type": "object",
-                "properties": {
-                    "Description": { "type": "string", "description": "Description of the product/service" },
-                    "TaxPercentage": { "type": "number", "description": "Tax percentage applied to the line item" },
-                    "TaxAmount": { "type": "number", "description": "Tax amount for the line item" },
-                    "Amount": { "type": "number", "description": "The total amount for the line item" }
                 },
-                "required": ["Description", "TaxPercentage", "TaxAmount", "Amount"],
-                "additionalProperties": false
-            }
-        }
-    },
-    "required": ["invoice_headers", "line_items"],
-    "additionalProperties": false
 }
 Guidelines for Processing:
 Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
@@ -244,6 +281,78 @@ def verify_api_key(api_key: str = Header(...)):
 def read_root():
     return {"message": "Welcome to the Invoice Summarization API!"}
 @app.get("/ocr/extraction")
 def ocr_from_s3(
     api_key: str = Depends(verify_api_key),
@@ -252,69 +361,80 @@ def ocr_from_s3(
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
     """
-    (PDF or Image) stored in S3 and summarize the text using GPT.
     """
     try:
         # Fetch file from S3
         file_data, content_type = fetch_file_from_s3_file(file_key)
         extracted_text = []
-        base64Data = base64.b64encode(file_data).decode('utf-8')
-        # Process PDF or Image file
         if content_type.startswith("image/"):  # Image file
-            image = Image.open(io.BytesIO(file_data)).convert("RGB")  # Use BytesIO stream directly
-            image_np = np.array(image)  # Convert to NumPy array
             base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
         elif content_type == "application/pdf":  # PDF file
-            # Open PDF using PyMuPDF
             pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
-            extracted_text = []
-            # Process each page in the PDF
-            for page_number in range(len(pdf_document)):
                 page = pdf_document[page_number]
-                extracted_text.append(page.get_text("text"))  # Extract text from PDF
             pdf_document.close()
-            base64DataResp = f"data:application/pdf;base64,{base64Data}"
-        else:
-            return {"error": f"Unsupported file type: {content_type}"}
-        # Combine extracted text
-        full_text = " ".join(extracted_text)
-        # Summarize the extracted text
-        summary = summarize_text(full_text)
-        # Document structure for MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
-            "entityrefkey": entity_ref_key,
-            "base64DataResp": base64DataResp,
-            "extracted_text": full_text,
-            "summary": summary,
         }
-        # Insert into MongoDB
         inserted_doc = invoice_collection.insert_one(document)
-        document_id = str(inserted_doc.inserted_id)  # Convert ObjectId to string
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "file_key": file_key,
-            "summary": summary
         }
     except Exception as e:
-        # Detailed error information
         error_details = {
             "error_type": type(e).__name__,
             "error_message": str(e),
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

 #     "InvoiceCurrency": "",
 #     "BaseAmount": "",
 #     "TaxAmount": "",
+#     "TotalInvoiceAmt": "",
 #     "CustomerName": "",
 #     "CustomerAddress": "",
 #     "CustomerGSTNO": "",
 # }
 {
+  "response_format": {
+    "type": "json_schema",
+    "json_schema": {
+      "name": "invoice",
+      "strict": true,
+      "schema": {
+        "type": "object",
+        "title": "Invoice Information Extractor",
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "properties": {
+          "LineItems": {
             "type": "array",
             "items": {
+              "type": "object",
+              "required": [
+                "ProductCode",
+                "Description",
+                "Amount"
+              ],
+              "properties": {
+                "Amount": {
+                  "type": "number",
+                  "title": "Amount",
+                  "description": "The amount of the product"
                 },
+                "Description": {
+                  "type": "string",
+                  "title": "Description",
+                  "description": "Description of the product"
+                },
+                "ProductCode": {
+                  "type": "string",
+                  "title": "Product Code",
+                  "description": "The code of the product"
+                }
+              },
+              "additionalProperties": false
+            },
+            "title": "Line Items",
+            "description": "List of line items on the invoice"
+          },
+          "TaxAmount": {
+            "type": "number",
+            "title": "Tax Amount",
+            "description": "The tax amount on the invoice"
+          },
+          "VendorGST": {
+            "type": "string",
+            "title": "Vendor GST",
+            "description": "The GST number of the vendor"
+          },
+          "VendorName": {
+            "type": "string",
+            "title": "Vendor Name",
+            "description": "The name of the vendor"
+          },
+          "InvoiceDate": {
+            "type": "string",
+            "title": "Invoice Date",
+            "description": "The date of the invoice in dd-MMM-yyyy format"
+          },
+          "TotalAmount": {
+            "type": "number",
+            "title": "Total Amount",
+            "description": "The total amount on the invoice"
+          },
+          "InvoiceNumber": {
+            "type": "string",
+            "title": "Invoice Number",
+            "description": "The number of the invoice"
+          },
+          "VendorAddress": {
+            "type": "string",
+            "title": "Vendor Address",
+            "description": "The address of the vendor"
+          },
+          "InvoiceCurrency": {
+            "type": "string",
+            "title": "Invoice Currency",
+            "description": "The currency used in the invoice, e.g., USD, INR, AUD"
+          }
+        },
+        "required": [
+          "LineItems",
+          "TaxAmount",
+          "VendorGST",
+          "VendorName",
+          "InvoiceDate",
+          "TotalAmount",
+          "InvoiceNumber",
+          "VendorAddress",
+          "InvoiceCurrency"
+        ],
+        "additionalProperties": false,
+        "description": "Schema for extracting specific information from invoices"
+      }
+    }
+  }
 }
 Guidelines for Processing:
 Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
 def read_root():
     return {"message": "Welcome to the Invoice Summarization API!"}
+# @app.get("/ocr/extraction")
+# def ocr_from_s3(
+#     api_key: str = Depends(verify_api_key),
+#     file_key: str = Query(..., description="S3 file key for the file"),
+#     document_type: str = Query(..., description="Type of document"),
+#     entity_ref_key: str = Query(..., description="Entity Reference Key")
+# ):
+#     """
+#     (PDF or Image) stored in S3 and summarize the text using GPT.
+#     """
+#     try:
+#         # Fetch file from S3
+#         file_data, content_type = fetch_file_from_s3_file(file_key)
+#         extracted_text = []
+#         base64Data = base64.b64encode(file_data).decode('utf-8')
+#         # Process PDF or Image file
+#         if content_type.startswith("image/"):  # Image file
+#             image = Image.open(io.BytesIO(file_data)).convert("RGB")  # Use BytesIO stream directly
+#             image_np = np.array(image)  # Convert to NumPy array
+#             base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
+#         elif content_type == "application/pdf":  # PDF file
+#             # Open PDF using PyMuPDF
+#             pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
+#             extracted_text = []
+#             # Process each page in the PDF
+#             for page_number in range(len(pdf_document)):
+#                 page = pdf_document[page_number]
+#                 extracted_text.append(page.get_text("text"))  # Extract text from PDF
+#             pdf_document.close()
+#             base64DataResp = f"data:application/pdf;base64,{base64Data}"
+#         else:
+#             return {"error": f"Unsupported file type: {content_type}"}
+#         # Combine extracted text
+#         full_text = " ".join(extracted_text)
+#         # Summarize the extracted text
+#         summary = summarize_text(full_text)
+#         # Document structure for MongoDB
+#         document = {
+#             "file_key": file_key,
+#             "file_type": content_type,
+#             "document_type": document_type,
+#             "entityrefkey": entity_ref_key,
+#             "base64DataResp": base64DataResp,
+#             "extracted_text": full_text,
+#             "summary": summary,
+#         }
+#         # Insert into MongoDB
+#         inserted_doc = invoice_collection.insert_one(document)
+#         document_id = str(inserted_doc.inserted_id)  # Convert ObjectId to string
+#         return {
+#             "message": "Document successfully stored in MongoDB",
+#             "document_id": document_id,
+#             "file_key": file_key,
+#             "summary": summary
+#         }
+#     except Exception as e:
+#         # Detailed error information
+#         error_details = {
+#             "error_type": type(e).__name__,
+#             "error_message": str(e),
+#             "traceback": traceback.format_exc()
+#         }
+#         return {"error": error_details}
 @app.get("/ocr/extraction")
 def ocr_from_s3(
     api_key: str = Depends(verify_api_key),
     entity_ref_key: str = Query(..., description="Entity Reference Key")
 ):
     """
+    Extract text from a PDF or Image stored in S3 and process it based on document size.
+    If more than 2 pages, skip Base64 conversion and summarization.
+    Store extracted data in MongoDB.
     """
     try:
         # Fetch file from S3
         file_data, content_type = fetch_file_from_s3_file(file_key)
         extracted_text = []
+        base64DataResp = None
+        summary = None
         if content_type.startswith("image/"):  # Image file
+            image = Image.open(io.BytesIO(file_data)).convert("RGB")
+            extracted_text.append(pytesseract.image_to_string(image))  # Extract text using OCR
+            # If single image, store Base64
+            base64Data = base64.b64encode(file_data).decode('utf-8')
             base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
         elif content_type == "application/pdf":  # PDF file
             pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
+            num_pages = len(pdf_document)
+            for page_number in range(num_pages):
                 page = pdf_document[page_number]
+                extracted_text.append(page.get_text("text"))
             pdf_document.close()
+            # If 2 pages or less, store Base64
+            if num_pages <= 2:
+                base64Data = base64.b64encode(file_data).decode('utf-8')
+                base64DataResp = f"data:application/pdf;base64,{base64Data}"
+            # If 2 pages or less, generate summary
+            if num_pages <= 2:
+                full_text = " ".join(extracted_text)
+                summary = summarize_text(full_text)
+        else:
+            return {"error": f"Unsupported file type: {content_type}"}
+        # Store extracted data in MongoDB
         document = {
             "file_key": file_key,
             "file_type": content_type,
             "document_type": document_type,
+            "entity_ref_key": entity_ref_key,
+            "num_pages": len(extracted_text),  # Store page count
+            "base64DataResp": base64DataResp,  # Only for small files
+            "extracted_text": " ".join(extracted_text),
+            "summary": summary,  # Only for small files
         }
         inserted_doc = invoice_collection.insert_one(document)
+        document_id = str(inserted_doc.inserted_id)
         return {
             "message": "Document successfully stored in MongoDB",
             "document_id": document_id,
             "file_key": file_key,
+            "num_pages": len(extracted_text),
+            "summary": summary if summary else "Skipped for large documents"
         }
     except Exception as e:
         error_details = {
             "error_type": type(e).__name__,
             "error_message": str(e),
             "traceback": traceback.format_exc()
         }
         return {"error": error_details}
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")