File size: 8,475 Bytes
59f9119
 
 
 
4434125
59f9119
 
 
 
39fc86b
dca3ec3
 
4434125
 
 
 
 
 
 
 
5e852da
4434125
 
 
dca3ec3
 
 
59f9119
9236c5b
59f9119
 
 
4434125
 
59f9119
4434125
 
 
 
 
59f9119
4434125
 
59f9119
4434125
 
 
 
 
 
 
 
 
 
dca3ec3
4434125
 
 
5e852da
dca3ec3
4434125
 
 
 
a5afdd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dca3ec3
4434125
 
 
 
a6e39e5
4434125
 
 
 
 
bac35d0
45a2314
a6e39e5
4434125
 
f2cdaec
4434125
 
cbfc36c
 
 
 
 
 
 
 
4434125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e852da
4434125
 
5e852da
4434125
 
0852412
4434125
 
 
 
 
 
 
5e852da
4434125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0852412
9bcc761
4434125
 
 
 
 
 
 
 
 
 
 
5e852da
4434125
 
 
9bcc761
 
4434125
 
 
 
 
 
 
9bcc761
39fc86b
b7703b3
59f9119
 
39fc86b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import uvicorn
from fastapi.staticfiles import StaticFiles
import hashlib
from enum import Enum
from fastapi import FastAPI,Header, Query,Depends,HTTPException
from paddleocr import PaddleOCR, PPStructure, save_structure_res
from PIL import Image
import io
import numpy as np
import fitz  # PyMuPDF for PDF handling
import logging

import boto3
import openai
import os
import traceback  # For detailed traceback of errors
import re
import json
from dotenv import load_dotenv
import uvicorn
import base64

# Load environment variables from a local .env file before reading any config.
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Serve the interactive Swagger docs at the root path.
app = FastAPI(docs_url='/')
use_gpu = False          # NOTE(review): defined but not passed to PaddleOCR below — confirm intent
output_dir = 'output'    # folder later mounted as static files at /output

# Initialize PaddleOCR
# use_angle_cls=True enables text-angle classification; English model.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# AWS S3 Configuration
API_KEY = os.getenv("API_KEY")                # shared secret checked by verify_api_key
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# OpenAI Configuration
openai.api_key = os.getenv("OPENAI_API_KEY")

# S3 Client
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY
)

# Function to fetch file from S3

def fetch_file_from_s3_file(file_key):
    """Download an object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside S3_BUCKET_NAME.

    Returns:
        tuple: (file_data, content_type) — the object's raw bytes and the
        MIME type stored with it in S3.

    Raises:
        Exception: wraps any boto3/S3 failure with a descriptive message,
        chaining the original error so the full traceback is preserved.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']  # MIME type recorded at upload time
        file_data = response['Body'].read()     # drain the streaming body into memory
        return file_data, content_type
    except Exception as e:
        # `from e` keeps the original exception chained for debugging;
        # the bare re-raise previously discarded it.
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e

# Function to summarize text using OpenAI GPT
def _extract_json_payload(content):
    """Return the text inside a Markdown ```json ... ``` fence, or *content* unchanged.

    The previous regex (`^.*```json\\n` without DOTALL) only stripped the fence
    when it appeared on the first line of the model output; any preamble text
    caused the subsequent json.loads to fail.
    """
    match = re.search(r'```(?:json)?\s*\n(.*?)\n?\s*```', content, re.DOTALL)
    return match.group(1) if match else content


def summarize_text(text):
    """Send OCR-extracted invoice text to GPT and parse the structured result.

    Args:
        text: Raw text extracted from the invoice by OCR.

    Returns:
        dict: Parsed {"invoice_headers": ..., "line_items": ...} on success.
        None: When the model response is not valid JSON.
        str: An "Error in summarization: ..." message when the API call fails
             (callers embed this value directly in their response, so the
             string-on-error contract is kept deliberately).
    """
    system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:

Vendor Information:

Vendor Name
Vendor Address
Vendor GST No.
Invoice Details:

Invoice No./Bill No./Consecutive Serial No./Serial No. of Invoice/INVOICE → Considered as InvoiceNo.
Invoice Date/Date/Date of Supply/Bill Date/Issuing Date/Dated → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
Invoice Currency/Currency
Base Amount/Amount
Tax Amount
Total Invoice Amount
Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
Billing Party Information:

Invoice Party/Bill To Name/Sold-to-Party/Taxpayer Name/M/s./CB No./Buyer (Bill to)/Billing Party/Customer Name & Address/Name → Considered as BillToName.
Invoice Party to / Bill To Address
Invoice Party to / Bill To GST No.
Shipping and References:

MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
Shipping Order
You should extract this data and structure it into a table-like format in the following JSON format:
{
  "invoice_headers": {
    "VendorName": "",
    "VendorAddress": "",
    "VendorGSTNo": "",
    "InvoiceNo": "",
    "InvoiceDate": "",
    "InvoiceCurrency": "",
    "BaseAmount": "",
    "TaxAmount": "",
    "TotalInvoiceAmt": "",
    "TypeofInvoice": "",
    "BillToName": "",
    "BillToAddress": "",
    "BillToGSTNO": "",
    "RefNo": "",
    "ShippingOrder": ""
  },
  "line_items": [
    {    
      "Description": "",
      "TaxPercentage": "",
      "TaxAmount": "",
      "Amount": 0
    }
  ]
}
Guidelines for Processing:

Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
Convert the Invoice Date to the specified dd-MMM-yyyy format.
Use the correct currency and amounts for each invoice field.
For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
If certain values are missing or not applicable, leave them empty or set them as null where necessary.
This JSON format will be used to store and manage invoices in a structured and uniform way."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{text}"}
            ],
            temperature=0.5,
            max_tokens=16384
        )
        content = response.choices[0].message.content.strip()
        logger.info("Raw model output: %s", content)

        # The model usually wraps its JSON in a Markdown code fence; strip it.
        cleaned_content = _extract_json_payload(content)

        try:
            return json.loads(cleaned_content)
        except json.JSONDecodeError as e:
            logger.error("Error parsing JSON: %s", e)
            logger.error("Cleaned content: %s", cleaned_content)
            return None
    except Exception as e:
        # Keep the historical string-on-error contract (see docstring).
        return f"Error in summarization: {str(e)}"
# Dependency to check API Key
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency validating the `api-key` request header.

    Raises:
        HTTPException: 401 when the key is missing from config or does not
        match the configured API_KEY.
    """
    import secrets  # local import: only needed for the constant-time compare

    # compare_digest avoids leaking key prefixes via timing differences;
    # the None guard keeps behavior when API_KEY is unset (always 401,
    # as with the original `!=` check).
    if API_KEY is None or not secrets.compare_digest(api_key, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API Key")

@app.get("/")
def read_root():
    """Root endpoint: returns a static welcome payload confirming the service is up."""
    greeting = "Welcome to the PaddleOCR with S3 and GPT Summarization API!"
    return {"message": greeting}

@app.get("/ocr/extraction")
def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(..., description="S3 file key for the file")):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Returns a dict with the original file key, its MIME type, the file as a
    base64 data URI, the concatenated OCR text, and the GPT summary. Any
    exception is caught and returned as {"error": {...}} with a traceback
    rather than surfacing as a 500.
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        extracted_text = []
        base64Data = base64.b64encode(file_data).decode('utf-8')

        # Determine file type based on MIME type
        if content_type.startswith("image/"):  # Image file
            image = Image.open(io.BytesIO(file_data)).convert("RGB")  # decode from memory
            image_np = np.array(image)  # PaddleOCR expects a NumPy array
            result = ocr.ocr(image_np, cls=True)
            # BUG FIX: content_type already includes the "image/" prefix, so the
            # old f"data:image/{content_type}..." produced "data:image/image/png".
            base64DataResp = f"data:{content_type.lower()};base64,{base64Data}"
            # Flatten OCR results: word_info is (box, (text, confidence)).
            for line in result:
                for word_info in line:
                    extracted_text.append(word_info[1][0])

        elif content_type == "application/pdf":  # PDF file
            # Open PDF from memory using PyMuPDF
            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")

            # Rasterize and OCR each page in turn
            for page_number in range(len(pdf_document)):
                page = pdf_document[page_number]

                # Render the page as an image
                pix = page.get_pixmap()
                image = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

                # Convert Pillow image to NumPy array (for PaddleOCR compatibility)
                image_np = np.array(image)

                # Run OCR on the rendered page
                result = ocr.ocr(image_np, cls=True)
                for line in result:
                    for word_info in line:
                        extracted_text.append(word_info[1][0])

            pdf_document.close()
            base64DataResp = f"data:application/pdf;base64,{base64Data}"
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)

        # Summarize the extracted text (dict on success, None/str on failure —
        # see summarize_text)
        summary = summarize_text(full_text)

        return {
            "file_key": file_key,
            "file_type": content_type,
            "base64DataResp":base64DataResp,
            "extracted_text": full_text,
            "summary": summary
        }

    except Exception as e:
        # Detailed error information returned to the caller instead of a 500
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
     
# Serve the output folder as static files
# NOTE(review): requires an "output" directory to exist at startup, else
# StaticFiles raises — confirm deployment creates it.
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

if __name__ == '__main__':
    # Run with uvicorn's defaults (127.0.0.1:8000) when executed directly.
    uvicorn.run(app=app)