Spaces:
Sleeping
Sleeping
| import uvicorn | |
| from fastapi.staticfiles import StaticFiles | |
| import hashlib | |
| from enum import Enum | |
| from fastapi import FastAPI,Header, Query,Depends,HTTPException | |
| from paddleocr import PaddleOCR, PPStructure, save_structure_res | |
| from PIL import Image | |
| import io | |
| import numpy as np | |
| import fitz # PyMuPDF for PDF handling | |
| import logging | |
| import boto3 | |
| import openai | |
| import os | |
| import traceback # For detailed traceback of errors | |
| import re | |
| import json | |
| from dotenv import load_dotenv | |
| import uvicorn | |
| import base64 | |
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Module-wide logging at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The interactive API documentation is served from the site root.
app = FastAPI(docs_url="/")

use_gpu = False
output_dir = "output"

# English OCR engine with the text-angle classifier switched on.
ocr = PaddleOCR(lang="en", use_angle_cls=True)

# Secrets and bucket name, all read from the environment (None when unset).
API_KEY = os.environ.get("API_KEY")
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME")

# Credentials for the OpenAI client library.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# One shared S3 client built from the explicit key pair above.
_s3_credentials = {
    "aws_access_key_id": AWS_ACCESS_KEY,
    "aws_secret_access_key": AWS_SECRET_KEY,
}
s3_client = boto3.client("s3", **_s3_credentials)
# Function to fetch file from S3
def fetch_file_from_s3_file(file_key):
    """Download one object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(file_data, content_type)`` tuple: the fully read response body
        and the ``ContentType`` value reported by S3 for the object.

    Raises:
        Exception: If the request or the read fails for any reason. The
            underlying error is chained as ``__cause__`` so the real failure
            stays visible in tracebacks.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response["ContentType"]  # MIME type stored with the object
        file_data = response["Body"].read()
    except Exception as e:
        # Keep the generic exception type callers already handle, but chain
        # the original error explicitly instead of relying on implicit context.
        raise Exception(f"Failed to fetch file from S3: {e}") from e
    return file_data, content_type
# Function to summarize text using OpenAI GPT

# Instructions sent as the system message on every summarization request.
_INVOICE_SYSTEM_PROMPT = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:
Vendor Information:
Vendor Name
Vendor Address
Vendor GST No.
Invoice Details:
Invoice No./Bill No./Consecutive Serial No./Serial No. of Invoice/INVOICE → Considered as InvoiceNo.
Invoice Date/Date/Date of Supply/Bill Date/Issuing Date/Dated → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
Invoice Currency/Currency
Base Amount/Amount
Tax Amount
Total Invoice Amount
Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
Billing Party Information:
Invoice Party/Bill To Name/Sold-to-Party/Taxpayer Name/M/s./CB No./Buyer (Bill to)/Billing Party/Customer Name & Address/Name → Considered as BillToName.
Invoice Party to / Bill To Address
Invoice Party to / Bill To GST No.
Shipping and References:
MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
Shipping Order
You should extract this data and structure it into a table-like format in the following JSON format:
{
"invoice_headers": {
"VendorName": "",
"VendorAddress": "",
"VendorGSTNo": "",
"InvoiceNo": "",
"InvoiceDate": "",
"InvoiceCurrency": "",
"BaseAmount": "",
"TaxAmount": "",
"TotalInvoiceAmt": "",
"TypeofInvoice": "",
"BillToName": "",
"BillToAddress": "",
"BillToGSTNO": "",
"RefNo": "",
"ShippingOrder": ""
},
"line_items": [
{
"Description": "",
"TaxPercentage": "",
"TaxAmount": "",
"Amount": 0
}
]
}
Guidelines for Processing:
Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
Convert the Invoice Date to the specified dd-MMM-yyyy format.
Use the correct currency and amounts for each invoice field.
For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
If certain values are missing or not applicable, leave them empty or set them as null where necessary.
This JSON format will be used to store and manage invoices in a structured and uniform way."""

# Matches a Markdown code fence (optionally tagged "json") anywhere in the
# reply and captures what is inside it. The closing fence is optional so a
# reply that opens a fence but never closes it is still unwrapped.
_JSON_FENCE_RE = re.compile(
    r"```(?:json)?[ \t]*\r?\n(.*?)(?:```|\Z)",
    re.DOTALL | re.IGNORECASE,
)


def _extract_json_payload(content):
    """Return the JSON text of a model reply with any Markdown fence removed."""
    fenced = _JSON_FENCE_RE.search(content)
    if fenced:
        return fenced.group(1).strip()
    return content.strip()


def summarize_text(text):
    """Ask the chat model to turn OCR text into structured invoice JSON.

    Args:
        text: Raw text extracted from the invoice.

    Returns:
        One of three shapes, kept for backward compatibility with callers:
        * the parsed JSON value when the model reply is valid JSON;
        * ``None`` when the reply cannot be parsed as JSON;
        * a string starting with ``"Error in summarization: "`` when the
          model call itself fails.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": _INVOICE_SYSTEM_PROMPT},
                {"role": "user", "content": str(text)},
            ],
            temperature=0.5,
            max_tokens=16384,
        )
        content = response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error in summarization: {str(e)}"

    # Logged at debug level only: the reply contains invoice data.
    logger.debug("Raw model reply: %s", content)

    # The reply may wrap the JSON in a code fence and surround it with prose;
    # keep only the fenced payload when there is one.
    cleaned_content = _extract_json_payload(content)
    try:
        return json.loads(cleaned_content)
    except json.JSONDecodeError as e:
        logger.warning("Model reply is not valid JSON: %s", e)
        logger.debug("Cleaned content: %s", cleaned_content)
        return None
# Dependency to check API Key
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency that rejects requests without the expected API key.

    Args:
        api_key: Value of the required request header declared via
            ``Header(...)`` (FastAPI maps the parameter name to the
            ``api-key`` header by default).

    Returns:
        The validated key, so handlers that declare
        ``Depends(verify_api_key)`` receive a ``str`` as their annotation says.

    Raises:
        HTTPException: 401 when the key does not match, or when no key is
            configured on the server.
    """
    import hmac  # local import: keeps this block independent of the file's import list

    # Fail closed when API_KEY is unset or empty; otherwise an empty header
    # would match an empty configured key.
    # Bytes are compared in constant time so response timing does not reveal
    # how much of a guessed key is correct.
    if not API_KEY or not hmac.compare_digest(
        api_key.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API Key")
    return api_key
def read_root():
    """Return the static welcome payload for this API."""
    greeting = "Welcome to the PaddleOCR with S3 and GPT Summarization API!"
    return dict(message=greeting)
def _ocr_text_fragments(image):
    """Run the shared OCR engine on a PIL image and return the recognised strings."""
    result = ocr.ocr(np.array(image.convert("RGB")), cls=True)
    fragments = []
    for line in result or []:
        # PaddleOCR can report an image with no detected text as None/empty
        # instead of a list of words; skip it rather than crash.
        if not line:
            continue
        # Each word entry is (box, (text, confidence)); keep the text only.
        fragments.extend(word_info[1][0] for word_info in line)
    return fragments


def ocr_from_s3(
    api_key: str = Depends(verify_api_key),
    file_key: str = Query(..., description="S3 file key for the file"),
):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Args:
        api_key: Injected by the ``verify_api_key`` dependency; not used here.
        file_key: S3 key of the file to process.

    Returns:
        On success, a dict with ``file_key``, ``file_type``, ``base64DataResp``
        (the file as a data URI), ``extracted_text`` and ``summary``.
        For an unsupported MIME type, or on any failure, a dict with a single
        ``error`` entry (the response shape existing clients rely on).
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        # Compare on the bare, lower-cased MIME type so values such as
        # "Application/PDF" or "image/png; charset=binary" are still recognised.
        mime_type = content_type.split(";", 1)[0].strip().lower()

        if mime_type.startswith("image/"):  # Image file
            extracted_text = _ocr_text_fragments(Image.open(io.BytesIO(file_data)))
        elif mime_type == "application/pdf":  # PDF file
            extracted_text = []
            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
            try:
                for page in pdf_document:
                    # Render the page to a PNG image and OCR that image.
                    pix = page.get_pixmap()
                    page_image = Image.open(io.BytesIO(pix.tobytes("png")))
                    extracted_text.extend(_ocr_text_fragments(page_image))
            finally:
                # Release the document even when OCR fails part-way through.
                pdf_document.close()
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # The MIME type already carries its "image/" or "application/" prefix,
        # so it is used as-is (previously images produced "data:image/image/...").
        base64Data = base64.b64encode(file_data).decode("utf-8")
        base64DataResp = f"data:{mime_type};base64,{base64Data}"

        # Combine extracted text
        full_text = " ".join(extracted_text)
        # Summarize the extracted text
        summary = summarize_text(full_text)
        return {
            "file_key": file_key,
            "file_type": content_type,
            "base64DataResp": base64DataResp,
            "extracted_text": full_text,
            "summary": summary,
        }
    except Exception as e:
        logger.exception("OCR request failed for file_key=%s", file_key)
        # NOTE(review): the traceback is returned to the client to keep the
        # existing response shape; this exposes internals and should be
        # reconsidered for public deployments.
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        return {"error": error_details}
# Serve the output folder as static files
# StaticFiles raises at start-up when its directory does not exist, so make
# sure the folder is there before mounting it.
os.makedirs(output_dir, exist_ok=True)
app.mount(
    "/output",
    StaticFiles(directory=output_dir, follow_symlink=True, html=True),
    name="output",
)

if __name__ == '__main__':
    # Run the app with uvicorn's default host and port when executed directly.
    uvicorn.run(app=app)