import uvicorn
from fastapi.staticfiles import StaticFiles
import hashlib
from enum import Enum
from fastapi import FastAPI, Header, Query, Depends, HTTPException
from pdf2image import convert_from_bytes
import io
import fitz # PyMuPDF for PDF handling
import logging
from pymongo import MongoClient
import boto3
import openai
import os
import traceback # For detailed traceback of errors
import re
import json
from dotenv import load_dotenv
import base64
from bson.objectid import ObjectId
# Load variables from a local .env file (if present) before reading the environment
load_dotenv()
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# MongoDB Configuration
MONGODB_URI = os.getenv("MONGODB_URI")
DATABASE_NAME = os.getenv("DATABASE_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
SCHEMA = os.getenv("SCHEMA")  # name of the collection that stores the JSON schemas
# Check that every setting used to open the database is present, so a missing
# value fails here with a clear message instead of inside the MongoDB driver.
_missing_settings = [
    name
    for name in ("MONGODB_URI", "DATABASE_NAME", "COLLECTION_NAME", "SCHEMA")
    if not os.getenv(name)
]
if _missing_settings:
    raise ValueError(f"{_missing_settings[0]} is not set. Please add it to your secrets.")
# Initialize MongoDB Connection
db_client = MongoClient(MONGODB_URI)
db = db_client[DATABASE_NAME]
invoice_collection = db[COLLECTION_NAME]
schema_collection = db[SCHEMA]
# The interactive API docs are served at the root path.
# NOTE(review): a GET "/" route is also registered further down in this file;
# confirm which of the two is meant to answer requests for "/".
app = FastAPI(docs_url='/')
use_gpu = False
output_dir = 'output'
@app.on_event("startup")
def startup_db():
    """Ping MongoDB when the application starts and log the outcome.

    A failed ping is only logged; the exception is not propagated.
    """
    try:
        db_client.server_info()
    except Exception as exc:
        logger.error(f"MongoDB connection failed: {str(exc)}")
    else:
        logger.info("MongoDB connection successful")
# Service credentials and targets, all read from the process environment
API_KEY = os.environ.get("API_KEY")  # shared secret compared against the request header
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME")  # bucket that holds the source documents
# Key used by the OpenAI client library
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Single S3 client shared by the helpers below
s3_client = boto3.client(
    's3',
    aws_secret_access_key=AWS_SECRET_KEY,
    aws_access_key_id=AWS_ACCESS_KEY,
)
# Function to fetch file from S3
def fetch_file_from_s3(file_key):
    """Download an object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(file_data, content_type)`` tuple: the object's raw bytes and the
        MIME type reported by S3, or ``application/octet-stream`` when the
        response carries no content type.

    Raises:
        Exception: If the object cannot be fetched or read. The underlying
            error is attached as the cause.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        # Same fallback as get_content_type_from_s3 so both helpers agree
        content_type = response.get('ContentType', 'application/octet-stream')
        body = response['Body']
        try:
            file_data = body.read()
        finally:
            # Release the underlying HTTP connection even if the read fails
            body.close()
        return file_data, content_type
    except Exception as e:
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
def extract_pdf_text(file_data):
    """
    Extracts text from a PDF file using PyMuPDF (fitz).

    Args:
        file_data: Raw bytes of the PDF document.

    Returns:
        The text of all pages joined with newlines, ``""`` when the document
        contains only whitespace, or ``None`` when the PDF cannot be opened or
        read (the error is logged).
    """
    try:
        pdf_document = fitz.open(stream=file_data, filetype="pdf")
        try:
            text = "\n".join(page.get_text("text") for page in pdf_document)
        finally:
            # Close the document even when a page fails to extract
            pdf_document.close()
        return text if text.strip() else ""  # Handle empty PDFs gracefully
    except Exception as e:
        logger.error(f"PDF Extraction Error: {e}")
        return None
# Function to extract structured invoice fields with OpenAI
def extract_invoice_data(file_data, content_type, json_schema):
    """
    Extracts invoice data from PDFs and images using OpenAI's GPT-4o-mini model.
    Ensures accurate JSON schema binding.

    PDFs are rendered to PNG images (at most two pages are accepted); image
    files are passed through unchanged. The images are sent to the model
    together with the system prompt, and the reply is parsed as JSON.

    Args:
        file_data: Raw bytes of the document.
        content_type: MIME type of the document (``application/pdf`` or ``image/*``).
        json_schema: Value passed as ``json_schema`` in the request's ``response_format``.

    Returns:
        Always a ``(result, base64DataResp)`` tuple. ``result`` is the parsed
        model output, or ``{"error": ...}`` on failure. ``base64DataResp`` is a
        list with a base64 data URL of the original file, or ``None`` when the
        input was rejected before the model was called.
    """
    system_prompt = """You are an expert in invoice data extraction.
Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
Extract the following fields:
1. Line Items: A list containing:
- Product Code
- Description
- Amount (numeric)
2. Tax Amount (if available)
3. Vendor GST (if available)
4. Vendor Name
5. Invoice Date (format: "DD-MMM-YYYY")
6. Total Amount (numeric)
7. Invoice Number (alpha-numeric)
8. Vendor Address
9. Invoice Currency
Ensure that:
- All extracted fields match the invoice.
- If any field is missing, return null instead of hallucinating data.
- Do not generate synthetic values—only extract real information from the image.
"""
    base64_images = []
    base64DataResp = []
    # Tolerate a missing content type instead of crashing on .startswith
    normalized_type = content_type or ""
    if normalized_type == "application/pdf":
        try:
            # NOTE(review): the extracted text is not forwarded to the model;
            # only the rendered page images are sent. Confirm whether it should be.
            extracted_text = extract_pdf_text(file_data)
            # Store PDF as Base64
            base64_pdf = base64.b64encode(file_data).decode('utf-8')
            base64DataResp.append(f"data:application/pdf;base64,{base64_pdf}")
            images = convert_from_bytes(file_data)  # Convert PDF to images
            if len(images) > 2:
                raise ValueError("PDF contains more than 2 pages.")
            for img in images[:2]:  # Convert up to 2 pages
                img_byte_arr = io.BytesIO()
                img.save(img_byte_arr, format="PNG", dpi=(300, 300))
                base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
                base64_images.append(f"data:image/png;base64,{base64_encoded}")
        except Exception as e:
            logger.error(f"Error converting PDF to image: {e}")
            return {"error": "Failed to process PDF"}, None
    elif normalized_type.startswith("image/"):
        # Handle direct image files
        base64_img = base64.b64encode(file_data).decode('utf-8')
        base64DataResp.append(f"data:{content_type};base64,{base64_img}")
        base64_images.append(f"data:{content_type};base64,{base64_img}")
    else:
        # Return a 2-tuple like every other path; callers unpack two values
        return {"error": f"Unsupported file type: {content_type}"}, None
    # Prepare OpenAI request
    openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": openai_content}
            ],
            response_format={"type": "json_schema", "json_schema": json_schema},
            temperature=0.5,
            max_tokens=16384
        )
        parsed_content = json.loads(response.choices[0].message.content.strip())
        return parsed_content, base64DataResp
    except Exception as e:
        logger.error(f"Error in OpenAI processing: {e}")
        return {"error": str(e)}, base64DataResp
def get_content_type_from_s3(file_key):
    """Look up the MIME type of an S3 object via a HEAD request.

    Returns the reported content type, or 'application/octet-stream' when the
    response carries none. Any failure is re-raised as a generic Exception.
    """
    fallback_type = 'application/octet-stream'
    try:
        head = s3_client.head_object(Key=file_key, Bucket=S3_BUCKET_NAME)
        mime_type = head.get('ContentType', fallback_type)
        return mime_type
    except Exception as err:
        raise Exception(f"Failed to get content type from S3: {str(err)}")
# Dependency to check API Key
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency that authenticates a request by its API key header.

    Args:
        api_key: Value of the required API key request header.

    Returns:
        The validated key, so endpoints that declare
        ``api_key: str = Depends(verify_api_key)`` receive a string.

    Raises:
        HTTPException: 401 when the key does not match ``API_KEY``, or when
            ``API_KEY`` is unset or empty (requests are rejected rather than
            letting an empty header match an empty secret).
    """
    import hmac  # local import: only this dependency needs it

    expected = API_KEY
    # compare_digest runs in constant time, so response timing does not leak
    # how much of the key matched; encode to bytes to allow non-ASCII input.
    if not expected or not hmac.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API Key")
    return api_key
@app.get("/")
def read_root():
    """Return the static welcome payload for the root path."""
    welcome = {"message": "Welcome to the Invoice Summarization API!"}
    return welcome
@app.get("/ocr/extraction")
def extract_text_from_file(
    api_key: str = Depends(verify_api_key),
    file_key: str = Query(..., description="S3 file key for the file"),
    document_type: str = Query(..., description="Type of document"),
    entity_ref_key: str = Query(..., description="Entity Reference Key")
):
    """Extract structured data from a PDF or image stored in S3.

    If a document with the same ``entity_ref_key`` is already stored, it is
    returned as-is. Otherwise the JSON schema for ``document_type`` is loaded,
    the file is fetched from S3, run through ``extract_invoice_data``, and the
    result is stored in MongoDB.

    Failed extractions are NOT stored: a stored failure would be served back
    from the cache on every later request for the same ``entity_ref_key``.

    Returns:
        The stored document (cache hit), a success payload containing the new
        document id and extracted data, or ``{"error": {...}}`` describing the
        failure.
    """
    try:
        # Reuse the stored result for this entity, if any
        existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
        if existing_document:
            existing_document["_id"] = str(existing_document["_id"])  # ObjectId is not JSON serializable
            return existing_document
        # Fetch JSON schema for the document type
        schema_doc = schema_collection.find_one({"document_type": document_type})
        if not schema_doc:
            raise ValueError("No schema found for the given document type")
        json_schema = schema_doc.get("json_schema")
        if not json_schema:
            raise ValueError("Schema is empty or not properly defined.")
        # Retrieve file from S3
        content_type = get_content_type_from_s3(file_key)
        file_data, _ = fetch_file_from_s3(file_key)
        # Extract structured data from the document
        extracted_data, base64DataResp = extract_invoice_data(file_data, content_type, json_schema)
        # extract_invoice_data reports failures as a dict whose only key is "error"
        if isinstance(extracted_data, dict) and extracted_data.keys() == {"error"}:
            raise RuntimeError(f"Extraction failed: {extracted_data['error']}")
        # Store document in MongoDB
        document = {
            "file_key": file_key,
            "file_type": content_type,
            "document_type": document_type,
            "entityrefkey": entity_ref_key,
            "base64DataResp": base64DataResp,
            "extracted_data": extracted_data
        }
        inserted_doc = invoice_collection.insert_one(document)
        document_id = str(inserted_doc.inserted_id)
        logger.info(f"Document inserted with ID: {document_id}")
        return {
            "message": "Document successfully stored in MongoDB",
            "document_id": document_id,
            "entityrefkey": entity_ref_key,
            "base64DataResp": base64DataResp,
            "extracted_data": extracted_data
        }
    except Exception as e:
        # Keep a server-side record; previously failures were visible only to the client
        logger.exception("Extraction request failed for file_key=%s", file_key)
        # NOTE(review): the traceback is returned to the caller, which exposes
        # internal details. Kept for response compatibility; consider dropping it.
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
# Serve the output folder as static files.
# StaticFiles checks that the directory exists when it is constructed, so
# create it first; otherwise importing this module fails on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
app.mount("/output", StaticFiles(directory=output_dir, follow_symlink=True, html=True), name="output")
if __name__ == '__main__':
    # Run the development server when the file is executed directly
    uvicorn.run(app=app)