"""FastAPI service: OCR files stored in S3 (image or PDF) with PaddleOCR,
then structure the recognized invoice text into JSON via OpenAI GPT."""

import base64
import hashlib
import io
import json
import logging
import os
import re
import traceback  # For detailed traceback of errors
from enum import Enum

import boto3
import fitz  # PyMuPDF for PDF handling
import numpy as np
import openai
import uvicorn
from dotenv import load_dotenv
from fastapi import Depends, FastAPI, Header, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from paddleocr import PaddleOCR, PPStructure, save_structure_res
from PIL import Image

load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(docs_url='/')

use_gpu = False
output_dir = 'output'

# Initialize PaddleOCR once at import time (model loading is expensive).
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# AWS S3 / API-key configuration — all secrets come from the environment.
API_KEY = os.getenv("API_KEY")
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# OpenAI Configuration
openai.api_key = os.getenv("OPENAI_API_KEY")

# S3 Client
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)


def fetch_file_from_s3_file(file_key):
    """Fetch an object from the configured S3 bucket.

    Args:
        file_key: S3 object key.

    Returns:
        Tuple of (raw file bytes, MIME content type reported by S3).

    Raises:
        Exception: wrapping any S3 failure, with the cause chained.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']  # Retrieve MIME type
        file_data = response['Body'].read()
        return file_data, content_type
    except Exception as e:
        # Chain the original exception so the root cause survives in tracebacks.
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e


def summarize_text(text):
    """Ask GPT to structure raw invoice OCR text into the standard JSON schema.

    Args:
        text: the full OCR-extracted text of one invoice.

    Returns:
        dict parsed from the model's JSON reply on success, None when the
        reply is not valid JSON, or an error string when the API call itself
        fails (kept as-is for backward compatibility with existing callers).
    """
    system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details: Vendor Information: Vendor Name Vendor Address Vendor GST No. Invoice Details: Invoice No. Invoice Date → Considered as InvoiceDate (formatted as dd-MMM-yyyy). 
Invoice Currency/Currency Base Amount/Amount Tax Amount Total Invoice Amount Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.) Customer Information: Customer Name Customer Address Customer GST No. Shipping and References: MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo. Shipping Order You should extract this data and structure it into a table-like format in the following JSON format: { "invoice_headers": { "VendorName": "", "VendorAddress": "", "VendorGSTNo": "", "InvoiceNo": "", "InvoiceDate": "", "InvoiceCurrency": "", "BaseAmount": "", "TaxAmount": "", "TotalInvoiceAmt": "", "TypeofInvoice": "", "CustomerName": "", "CustomerAddress": "", "CustomerGSTNO": "", "RefNo": "", "ShippingOrder": "" }, "line_items": [ { "Description": "", "TaxPercentage": "", "TaxAmount": "", "Amount": 0 } ] } Guidelines for Processing: Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.). Convert the Invoice Date to the specified dd-MMM-yyyy format. Use the correct currency and amounts for each invoice field. For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount. If certain values are missing or not applicable, leave them empty or set them as null where necessary. This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. 
No extra content should not provide."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{text}"},
            ],
            temperature=0.5,
            max_tokens=16384,
        )
        content = response.choices[0].message.content.strip()
        logger.info("Raw model reply: %s", content)
        # The model may wrap its JSON in a ```json ... ``` fence; strip it.
        cleaned_content = re.sub(r'^.*```json\n', '', content)   # Remove '```json\n' at the beginning
        cleaned_content = re.sub(r'\n```$', '', cleaned_content)  # Remove '\n```' at the end
        try:
            return json.loads(cleaned_content)
        except json.JSONDecodeError as e:
            logger.error("Error parsing JSON: %s", e)
            # Keep the cleaned content in the log to debug bad model output.
            logger.error("Cleaned content: %s", cleaned_content)
            return None
    except Exception as e:
        # Backward-compatible: callers historically receive an error string here.
        return f"Error in summarization: {str(e)}"


def verify_api_key(api_key: str = Header(...)):
    """Dependency: reject requests whose `api-key` header doesn't match API_KEY."""
    if api_key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid API Key")


@app.get("/")
def read_root():
    """Health/landing endpoint."""
    return {"message": "Welcome to the PaddleOCR with S3 and GPT Summarization API!"}


def _ocr_image_text(image):
    """Run PaddleOCR on a PIL RGB image; return the recognized text fragments.

    PaddleOCR can yield None / empty entries for images with no detected text,
    so each line is guarded before iterating its words.
    """
    image_np = np.array(image)  # PaddleOCR expects a NumPy array
    result = ocr.ocr(image_np, cls=True)
    texts = []
    for line in result or []:
        if not line:
            continue  # blank page / no detections
        for word_info in line:
            texts.append(word_info[1][0])  # word_info = (box, (text, confidence))
    return texts


@app.get("/ocr/extraction")
def ocr_from_s3(api_key: str = Depends(verify_api_key), file_key: str = Query(..., description="S3 file key for the file")):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)
        extracted_text = []
        base64Data = base64.b64encode(file_data).decode('utf-8')

        # Determine file type based on MIME type
        if content_type.startswith("image/"):  # Image file
            image = Image.open(io.BytesIO(file_data)).convert("RGB")
            extracted_text = _ocr_image_text(image)
            # content_type already includes the "image/" prefix — use it as-is
            # (the old code produced a malformed "data:image/image/png" URI).
            base64DataResp = f"data:{content_type.lower()};base64,{base64Data}"
        elif content_type == "application/pdf":  # PDF file
            # Open PDF using PyMuPDF and OCR each rendered page.
            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
            try:
                extracted_text = []
                for page_number in range(len(pdf_document)):
                    page = pdf_document[page_number]
                    # Render the page as an image, then OCR it.
                    pix = page.get_pixmap()
                    image = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
                    extracted_text.extend(_ocr_image_text(image))
            finally:
                # Always release the document, even if OCR fails mid-way.
                pdf_document.close()
            base64DataResp = f"data:application/pdf;base64,{base64Data}"
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)

        # Summarize the extracted text
        summary = summarize_text(full_text)

        return {
            "file_key": file_key,
            "file_type": content_type,
            "base64DataResp": base64DataResp,
            "extracted_text": full_text,
            "summary": summary,
        }
    except Exception as e:
        # Detailed error information
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        return {"error": error_details}


# Serve the output folder as static files
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True),
          name="output")

if __name__ == '__main__':
    uvicorn.run(app=app)