File size: 8,475 Bytes
59f9119
 
 
 
4434125
59f9119
 
 
 
39fc86b
dca3ec3
 
4434125
 
 
 
 
 
 
 
5e852da
4434125
 
 
dca3ec3
 
 
59f9119
9236c5b
59f9119
 
 
4434125
 
59f9119
4434125
 
 
 
 
59f9119
4434125
 
59f9119
4434125
 
 
 
 
 
 
 
 
 
dca3ec3
4434125
 
 
5e852da
dca3ec3
4434125
 
 
 
a5afdd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dca3ec3
4434125
 
 
 
a6e39e5
4434125
 
 
 
 
bac35d0
45a2314
a6e39e5
4434125
 
f2cdaec
4434125
 
cbfc36c
 
 
 
 
 
 
 
4434125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e852da
4434125
 
5e852da
4434125
 
0852412
4434125
 
 
 
 
 
 
5e852da
4434125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0852412
9bcc761
4434125
 
 
 
 
 
 
 
 
 
 
5e852da
4434125
 
 
9bcc761
 
4434125
 
 
 
 
 
 
9bcc761
39fc86b
b7703b3
59f9119
 
39fc86b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import uvicorn
from fastapi.staticfiles import StaticFiles
import hashlib
from enum import Enum
from fastapi import FastAPI,Header, Query,Depends,HTTPException
from paddleocr import PaddleOCR, PPStructure, save_structure_res
from PIL import Image
import io
import numpy as np
import fitz  # PyMuPDF for PDF handling
import logging

import boto3
import openai
import os
import traceback  # For detailed traceback of errors
import re
import json
from dotenv import load_dotenv
import uvicorn
import base64

# Load environment variables from a local .env file before reading any config.
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Serve the interactive Swagger docs at the root path.
app = FastAPI(docs_url='/')
use_gpu = False          # NOTE(review): defined but not passed to PaddleOCR below — confirm intent
output_dir = 'output'    # folder later mounted as static files at /output

# Initialize PaddleOCR
# use_angle_cls=True enables text-angle classification; English model.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# AWS S3 Configuration
API_KEY = os.getenv("API_KEY")                # shared secret checked by verify_api_key
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# OpenAI Configuration
openai.api_key = os.getenv("OPENAI_API_KEY")

# S3 Client
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY
)

# Function to fetch file from S3

def fetch_file_from_s3_file(file_key):
    """Download an object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside S3_BUCKET_NAME.

    Returns:
        tuple: (file_data, content_type) — the object's raw bytes and the
        MIME type stored with it in S3.

    Raises:
        Exception: wraps any boto3/S3 failure with a descriptive message,
        chaining the original error so the full traceback is preserved.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']  # MIME type recorded at upload time
        file_data = response['Body'].read()     # drain the streaming body into memory
        return file_data, content_type
    except Exception as e:
        # `from e` keeps the original exception chained for debugging;
        # the bare re-raise previously discarded it.
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e

# Function to summarize text using OpenAI GPT
def _extract_json_payload(content):
    """Return the text inside a Markdown ```json ... ``` fence, or *content* unchanged.

    The previous regex (`^.*```json\\n` without DOTALL) only stripped the fence
    when it appeared on the first line of the model output; any preamble text
    caused the subsequent json.loads to fail.
    """
    match = re.search(r'```(?:json)?\s*\n(.*?)\n?\s*```', content, re.DOTALL)
    return match.group(1) if match else content


def summarize_text(text):
    """Send OCR-extracted invoice text to GPT and parse the structured result.

    Args:
        text: Raw text extracted from the invoice by OCR.

    Returns:
        dict: Parsed {"invoice_headers": ..., "line_items": ...} on success.
        None: When the model response is not valid JSON.
        str: An "Error in summarization: ..." message when the API call fails
             (callers embed this value directly in their response, so the
             string-on-error contract is kept deliberately).
    """
    system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:

Vendor Information:

Vendor Name
Vendor Address
Vendor GST No.
Invoice Details:

Invoice No./Bill No./Consecutive Serial No./Serial No. of Invoice/INVOICE → Considered as InvoiceNo.
Invoice Date/Date/Date of Supply/Bill Date/Issuing Date/Dated → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
Invoice Currency/Currency
Base Amount/Amount
Tax Amount
Total Invoice Amount
Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
Billing Party Information:

Invoice Party/Bill To Name/Sold-to-Party/Taxpayer Name/M/s./CB No./Buyer (Bill to)/Billing Party/Customer Name & Address/Name → Considered as BillToName.
Invoice Party to / Bill To Address
Invoice Party to / Bill To GST No.
Shipping and References:

MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
Shipping Order
You should extract this data and structure it into a table-like format in the following JSON format:
{
  "invoice_headers": {
    "VendorName": "",
    "VendorAddress": "",
    "VendorGSTNo": "",
    "InvoiceNo": "",
    "InvoiceDate": "",
    "InvoiceCurrency": "",
    "BaseAmount": "",
    "TaxAmount": "",
    "TotalInvoiceAmt": "",
    "TypeofInvoice": "",
    "BillToName": "",
    "BillToAddress": "",
    "BillToGSTNO": "",
    "RefNo": "",
    "ShippingOrder": ""
  },
  "line_items": [
    {    
      "Description": "",
      "TaxPercentage": "",
      "TaxAmount": "",
      "Amount": 0
    }
  ]
}
Guidelines for Processing:

Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
Convert the Invoice Date to the specified dd-MMM-yyyy format.
Use the correct currency and amounts for each invoice field.
For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
If certain values are missing or not applicable, leave them empty or set them as null where necessary.
This JSON format will be used to store and manage invoices in a structured and uniform way."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{text}"}
            ],
            temperature=0.5,
            max_tokens=16384
        )
        content = response.choices[0].message.content.strip()
        logger.info("Raw model output: %s", content)

        # The model usually wraps its JSON in a Markdown code fence; strip it.
        cleaned_content = _extract_json_payload(content)

        try:
            return json.loads(cleaned_content)
        except json.JSONDecodeError as e:
            logger.error("Error parsing JSON: %s", e)
            logger.error("Cleaned content: %s", cleaned_content)
            return None
    except Exception as e:
        # Keep the historical string-on-error contract (see docstring).
        return f"Error in summarization: {str(e)}"
# Dependency to check API Key
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency validating the `api-key` request header.

    Raises:
        HTTPException: 401 when the key is missing from config or does not
        match the configured API_KEY.
    """
    import secrets  # local import: only needed for the constant-time compare

    # compare_digest avoids leaking key prefixes via timing differences;
    # the None guard keeps behavior when API_KEY is unset (always 401,
    # as with the original `!=` check).
    if API_KEY is None or not secrets.compare_digest(api_key, API_KEY):
        raise HTTPException(status_code=401, detail="Invalid API Key")

@app.get("/")
def read_root():
    """Root endpoint: returns a static welcome payload confirming the service is up."""
    greeting = "Welcome to the PaddleOCR with S3 and GPT Summarization API!"
    return {"message": greeting}

@app.get("/ocr/extraction")
def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(..., description="S3 file key for the file")):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Returns a dict with the original file key, its MIME type, the file as a
    base64 data URI, the concatenated OCR text, and the GPT summary. Any
    exception is caught and returned as {"error": {...}} with a traceback
    rather than surfacing as a 500.
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        extracted_text = []
        base64Data = base64.b64encode(file_data).decode('utf-8')

        # Determine file type based on MIME type
        if content_type.startswith("image/"):  # Image file
            image = Image.open(io.BytesIO(file_data)).convert("RGB")  # decode from memory
            image_np = np.array(image)  # PaddleOCR expects a NumPy array
            result = ocr.ocr(image_np, cls=True)
            # BUG FIX: content_type already includes the "image/" prefix, so the
            # old f"data:image/{content_type}..." produced "data:image/image/png".
            base64DataResp = f"data:{content_type.lower()};base64,{base64Data}"
            # Flatten OCR results: word_info is (box, (text, confidence)).
            for line in result:
                for word_info in line:
                    extracted_text.append(word_info[1][0])

        elif content_type == "application/pdf":  # PDF file
            # Open PDF from memory using PyMuPDF
            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")

            # Rasterize and OCR each page in turn
            for page_number in range(len(pdf_document)):
                page = pdf_document[page_number]

                # Render the page as an image
                pix = page.get_pixmap()
                image = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

                # Convert Pillow image to NumPy array (for PaddleOCR compatibility)
                image_np = np.array(image)

                # Run OCR on the rendered page
                result = ocr.ocr(image_np, cls=True)
                for line in result:
                    for word_info in line:
                        extracted_text.append(word_info[1][0])

            pdf_document.close()
            base64DataResp = f"data:application/pdf;base64,{base64Data}"
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)

        # Summarize the extracted text (dict on success, None/str on failure —
        # see summarize_text)
        summary = summarize_text(full_text)

        return {
            "file_key": file_key,
            "file_type": content_type,
            "base64DataResp":base64DataResp,
            "extracted_text": full_text,
            "summary": summary
        }

    except Exception as e:
        # Detailed error information returned to the caller instead of a 500
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
     
# Serve the output folder as static files
# NOTE(review): requires an "output" directory to exist at startup, else
# StaticFiles raises — confirm deployment creates it.
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

if __name__ == '__main__':
    # Run with uvicorn's defaults (127.0.0.1:8000) when executed directly.
    uvicorn.run(app=app)