document-extraction

Sleeping

App Files Files Community

document-extraction / app.py

kmuthudurai

Update app.py

45a2314 verified about 1 year ago

raw

history blame

8.48 kB

	import uvicorn
	from fastapi.staticfiles import StaticFiles
	import hashlib
	from enum import Enum
	from fastapi import FastAPI,Header, Query,Depends,HTTPException
	from paddleocr import PaddleOCR, PPStructure, save_structure_res
	from PIL import Image
	import io
	import numpy as np
	import fitz # PyMuPDF for PDF handling
	import logging

	import boto3
	import openai
	import os
	import traceback # For detailed traceback of errors
	import re
	import json
	from dotenv import load_dotenv
	import uvicorn
	import base64

	# Load variables from a local .env file (if present) into the process
	# environment; this must run before the os.getenv() calls below.
	load_dotenv()

	# Set up logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Application instance. docs_url='/' asks FastAPI to serve the interactive
	# API docs at the root path.
	app = FastAPI(docs_url='/')
	# NOTE(review): use_gpu is not referenced anywhere in the visible code -
	# confirm whether it is still needed.
	use_gpu = False
	# Name of the output folder; the static mount near the end of the file
	# serves a directory with this same name.
	output_dir = 'output'

	# Initialize PaddleOCR once at import time; the single instance is reused
	# by the OCR request handler.
	ocr = PaddleOCR(use_angle_cls=True, lang='en')

	# Service configuration read from the environment. Each value is None when
	# the corresponding variable is not set.
	# API_KEY is the shared secret that verify_api_key compares request
	# headers against; the remaining three configure S3 access.
	API_KEY = os.getenv("API_KEY")
	AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
	AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
	S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

	# OpenAI Configuration
	openai.api_key = os.getenv("OPENAI_API_KEY")

	# S3 Client, built with the explicit credentials read above and shared by
	# every request.
	s3_client = boto3.client(
	's3',
	aws_access_key_id=AWS_ACCESS_KEY,
	aws_secret_access_key=AWS_SECRET_KEY
	)

	# Function to fetch file from S3

	def fetch_file_from_s3_file(file_key):
	    """Download one object from the configured S3 bucket.

	    Args:
	        file_key: Key of the object inside ``S3_BUCKET_NAME``.

	    Returns:
	        A ``(file_data, content_type)`` tuple: the full content read from
	        the response body, and the ``ContentType`` value S3 reported for
	        the object.

	    Raises:
	        Exception: If the request or the handling of its response fails
	            for any reason. The message is prefixed with
	            ``"Failed to fetch file from S3: "`` and the original error is
	            attached as ``__cause__`` so its traceback is not lost.
	    """
	    try:
	        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
	        content_type = response['ContentType']  # MIME type stored with the object
	        file_data = response['Body'].read()  # read the whole body into memory
	    except Exception as e:
	        # Keep the generic exception type callers already catch, but chain
	        # the root cause explicitly.
	        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
	    return file_data, content_type

	# Function to summarize text using OpenAI GPT
	def summarize_text(text):
	    """Ask the OpenAI chat model to structure OCR'd invoice text as JSON.

	    The model is given a fixed system prompt describing the invoice header
	    and line-item schema, plus ``text`` as the user message. The reply is
	    unwrapped from a Markdown code fence (if any) and parsed as JSON.

	    Args:
	        text: OCR text of the invoice to be structured.

	    Returns:
	        The parsed JSON value on success; ``None`` when the reply cannot be
	        parsed as JSON; or a string starting with
	        ``"Error in summarization: "`` when the request (or any other step)
	        raises.
	    """
	    system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:

	Vendor Information:

	Vendor Name
	Vendor Address
	Vendor GST No.
	Invoice Details:

	Invoice No./Bill No./Consecutive Serial No./Serial No. of Invoice/INVOICE → Considered as InvoiceNo.
	Invoice Date/Date/Date of Supply/Bill Date/Issuing Date/Dated → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
	Invoice Currency/Currency
	Base Amount/Amount
	Tax Amount
	Total Invoice Amount
	Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
	Billing Party Information:

	Invoice Party/Bill To Name/Sold-to-Party/Taxpayer Name/M/s./CB No./Buyer (Bill to)/Billing Party/Customer Name & Address/Name → Considered as BillToName.
	Invoice Party to / Bill To Address
	Invoice Party to / Bill To GST No.
	Shipping and References:

	MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
	Shipping Order
	You should extract this data and structure it into a table-like format in the following JSON format:
	{
	"invoice_headers": {
	"VendorName": "",
	"VendorAddress": "",
	"VendorGSTNo": "",
	"InvoiceNo": "",
	"InvoiceDate": "",
	"InvoiceCurrency": "",
	"BaseAmount": "",
	"TaxAmount": "",
	"TotalInvoiceAmt": "",
	"TypeofInvoice": "",
	"BillToName": "",
	"BillToAddress": "",
	"BillToGSTNO": "",
	"RefNo": "",
	"ShippingOrder": ""
	},
	"line_items": [
	{
	"Description": "",
	"TaxPercentage": "",
	"TaxAmount": "",
	"Amount": 0
	}
	]
	}
	Guidelines for Processing:

	Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
	Convert the Invoice Date to the specified dd-MMM-yyyy format.
	Use the correct currency and amounts for each invoice field.
	For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
	If certain values are missing or not applicable, leave them empty or set them as null where necessary.
	This JSON format will be used to store and manage invoices in a structured and uniform way."""
	    try:
	        response = openai.ChatCompletion.create(
	            model="gpt-4o-mini",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": str(text)},
	            ],
	            temperature=0.5,
	            max_tokens=16384,
	        )
	        content = response.choices[0].message.content.strip()
	        # Raw reply may contain invoice data, so keep it at debug level.
	        logger.debug("Model reply before cleanup: %s", content)

	        # Unwrap the first Markdown code fence wherever it appears in the
	        # reply. DOTALL lets the pattern span lines, so a preamble line
	        # before the fence, a bare ``` fence without the "json" tag, or a
	        # missing closing fence no longer defeats the cleanup.
	        fenced = re.search(
	            r"```(?:json)?\s*(.*?)\s*(?:```|\Z)",
	            content,
	            re.DOTALL | re.IGNORECASE,
	        )
	        cleaned_content = fenced.group(1) if fenced else content

	        try:
	            return json.loads(cleaned_content)
	        except json.JSONDecodeError as e:
	            logger.warning("Error parsing JSON: %s", e)
	            logger.debug("Cleaned content: %s", cleaned_content)
	            return None
	    except Exception as e:
	        # Contract kept from the original: failures are reported to the
	        # caller as a string instead of being raised.
	        return f"Error in summarization: {str(e)}"
	# Dependency to check API Key
	def verify_api_key(api_key: str = Header(...)):
	    """FastAPI dependency that rejects requests without the expected API key.

	    Args:
	        api_key: Value of the required request header bound by
	            ``Header(...)``.

	    Raises:
	        HTTPException: With status 401 when the supplied key does not match
	            ``API_KEY``, or when ``API_KEY`` is not configured (no key can
	            match in that case).
	    """
	    # Local import keeps this block self-contained; hmac is standard library.
	    import hmac

	    expected = API_KEY
	    # Compare in constant time so response timing does not reveal how much
	    # of a guessed key is correct. Encoding to bytes lets compare_digest
	    # accept non-ASCII header values instead of raising TypeError.
	    if expected is None or not hmac.compare_digest(
	        api_key.encode("utf-8"), expected.encode("utf-8")
	    ):
	        raise HTTPException(status_code=401, detail="Invalid API Key")

	@app.get("/")
	def read_root():
	    """Return the static welcome payload registered for the root path."""
	    # NOTE(review): the app is created with docs_url='/', so the interactive
	    # docs may be what actually answers "/" - confirm this route is reachable.
	    welcome = {"message": "Welcome to the PaddleOCR with S3 and GPT Summarization API!"}
	    return welcome

	def _ocr_text_lines(image):
	    """Run the shared PaddleOCR engine on a PIL image and return its text strings."""
	    # PaddleOCR is fed a NumPy array built from the RGB-converted image.
	    result = ocr.ocr(np.array(image.convert("RGB")), cls=True)
	    texts = []
	    for line in result or []:
	        # NOTE(review): some PaddleOCR releases appear to yield None for an
	        # image with no detected text - confirm. Skipping empty entries
	        # avoids a TypeError on blank pages either way.
	        if not line:
	            continue
	        texts.extend(word_info[1][0] for word_info in line)
	    return texts


	@app.get("/ocr/extraction")
	def ocr_from_s3(
	    api_key: str = Depends(verify_api_key),
	    file_key: str = Query(..., description="S3 file key for the file"),
	):
	    """
	    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

	    Args:
	        api_key: Result of the ``verify_api_key`` dependency; unused in the
	            body, present so the key check runs before the handler.
	        file_key: S3 key of the file to process.

	    Returns:
	        On success, a dict with ``file_key``, ``file_type`` (the content
	        type reported by S3), ``base64DataResp`` (the file as a ``data:``
	        URI), ``extracted_text`` (OCR text joined with spaces) and
	        ``summary`` (the result of ``summarize_text``).
	        For an unsupported content type, ``{"error": "<message>"}``.
	        For any exception, ``{"error": {...}}`` carrying the error type,
	        message and traceback.
	    """
	    try:
	        # Fetch file from S3
	        file_data, content_type = fetch_file_from_s3_file(file_key)

	        # Drop any parameters (e.g. "; charset=...") and normalise case so
	        # the type checks and the data URI use the bare MIME type.
	        mime_type = content_type.split(";", 1)[0].strip().lower()

	        if mime_type.startswith("image/"):  # Image file
	            extracted_text = _ocr_text_lines(Image.open(io.BytesIO(file_data)))

	        elif mime_type == "application/pdf":  # PDF file
	            extracted_text = []
	            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
	            try:
	                for page in pdf_document:
	                    # Render the page to PNG bytes, then OCR that image.
	                    pix = page.get_pixmap()
	                    page_image = Image.open(io.BytesIO(pix.tobytes("png")))
	                    extracted_text.extend(_ocr_text_lines(page_image))
	            finally:
	                # Close the document even when rendering or OCR fails.
	                pdf_document.close()

	        else:
	            return {"error": f"Unsupported file type: {content_type}"}

	        # Encode only once the type is known to be supported. The MIME type
	        # already includes its "image/" or "application/" prefix, so it is
	        # used as-is (the original produced "data:image/image/png;...").
	        base64Data = base64.b64encode(file_data).decode('utf-8')
	        base64DataResp = f"data:{mime_type};base64,{base64Data}"

	        # Combine extracted text
	        full_text = " ".join(extracted_text)

	        # Summarize the extracted text
	        summary = summarize_text(full_text)

	        return {
	            "file_key": file_key,
	            "file_type": content_type,
	            "base64DataResp": base64DataResp,
	            "extracted_text": full_text,
	            "summary": summary
	        }

	    except Exception as e:
	        # Record the failure server-side as well as returning it.
	        logger.exception("OCR extraction failed for file_key=%s", file_key)
	        # NOTE(review): the traceback is returned to the client in a normal
	        # response body. Kept because clients may rely on this shape, but
	        # it exposes internals - consider an HTTP error status instead.
	        error_details = {
	            "error_type": type(e).__name__,
	            "error_message": str(e),
	            "traceback": traceback.format_exc()
	        }
	        return {"error": error_details}

	# Serve the output folder as static files.
	# StaticFiles checks that the directory exists when it is constructed, so
	# create it first; otherwise a fresh checkout fails at import time.
	os.makedirs(output_dir, exist_ok=True)
	app.mount("/output", StaticFiles(directory=output_dir, follow_symlink=True, html=True), name="output")

	# Run the app with uvicorn's default settings when executed as a script.
	if __name__ == '__main__':
	    uvicorn.run(app=app)