vkumartr committed on
Commit
53ec771
·
verified ·
1 Parent(s): ffeeaf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -134
app.py CHANGED
@@ -2,12 +2,13 @@ import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
- from fastapi import FastAPI, Header, Query, Depends, HTTPException
 
6
  from PIL import Image
7
  import io
 
8
  import fitz # PyMuPDF for PDF handling
9
  import logging
10
- from pymongo import MongoClient
11
 
12
  import boto3
13
  import openai
@@ -16,39 +17,29 @@ import traceback # For detailed traceback of errors
16
  import re
17
  import json
18
  from dotenv import load_dotenv
 
19
  import base64
20
- from bson.objectid import ObjectId
21
 
22
- db_client = None
23
  load_dotenv()
24
 
25
  # Set up logging
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
# MongoDB configuration. Every setting is read from the environment, and all
# of them are required because the client, the database and both collections
# are created eagerly at import time.
MONGODB_URI = os.getenv("MONGODB_URI")
DATABASE_NAME = os.getenv("DATABASE_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
SCHEMA = os.getenv("SCHEMA")

if not MONGODB_URI:
    raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")

# Fail fast with a readable message when a database/collection name is
# missing, instead of passing None on as a name to the MongoDB client.
_missing_settings = [
    name
    for name, value in (
        ("DATABASE_NAME", DATABASE_NAME),
        ("COLLECTION_NAME", COLLECTION_NAME),
        ("SCHEMA", SCHEMA),
    )
    if not value
]
if _missing_settings:
    raise ValueError(
        f"{', '.join(_missing_settings)} is not set. Please add it to your secrets."
    )

db_client = MongoClient(MONGODB_URI)
db = db_client[DATABASE_NAME]
invoice_collection = db[COLLECTION_NAME]
schema_collection = db[SCHEMA]
42
 
43
  app = FastAPI(docs_url='/')
 
 
44
 
45
@app.on_event("startup")
def startup_db():
    """Check the MongoDB connection once when the application starts.

    Calls ``server_info()`` on the module-level client and logs whether it
    succeeded. A failure is only logged; it does not prevent startup.
    """
    try:
        db_client.server_info()
    except Exception as e:
        logger.error(f"MongoDB connection failed: {str(e)}")
    else:
        logger.info("MongoDB connection successful")
52
 
53
  # AWS S3 Configuration
54
  API_KEY = os.getenv("API_KEY")
@@ -59,153 +50,202 @@ S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
59
  # OpenAI Configuration
60
  openai.api_key = os.getenv("OPENAI_API_KEY")
61
 
 
62
  s3_client = boto3.client(
63
  's3',
64
  aws_access_key_id=AWS_ACCESS_KEY,
65
  aws_secret_access_key=AWS_SECRET_KEY
66
  )
67
 
68
def fetch_file_from_s3(file_key):
    """Retrieve an object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(file_data, content_type)`` tuple: the content read from the
        response body and the ``ContentType`` reported by S3.

    Raises:
        Exception: If the object cannot be fetched or read. The underlying
            error is attached as ``__cause__`` so its traceback is kept.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']
        file_data = response['Body'].read()
    except Exception as e:
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
    return file_data, content_type
77
 
78
def extract_invoice_data(file_data, content_type, json_schema):
    """Extract structured data from a PDF or image via the OpenAI chat API.

    PDFs are converted to PNG page images first; only PDFs with 1 or 2 pages
    are accepted. Any other content type is sent unchanged as a single image.

    Args:
        file_data: Raw bytes of the document.
        content_type: MIME type of ``file_data``.
        json_schema: Schema passed to the model as the ``json_schema``
            response format.

    Returns:
        A ``(parsed_content, base64_images)`` tuple. On failure the first
        element is ``{"error": <message>}``. The second element is ``None``
        when the PDF was rejected or could not be converted, otherwise the
        list of data URLs that were sent to the model.
    """
    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
    base64_images = []

    if content_type == "application/pdf":
        try:
            images = convert_from_bytes(file_data)  # Convert PDF to images

            if len(images) <= 2:
                for img in images:
                    img_byte_arr = io.BytesIO()
                    img.save(img_byte_arr, format="PNG")
                    base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
                    base64_images.append(f"data:image/png;base64,{base64_encoded}")
        except Exception as e:
            logger.error(f"Error converting PDF to image: {e}")
            return {"error": "Failed to process PDF"}, None

        # The page-limit check lives outside the try block so its specific
        # message reaches the caller instead of being replaced by the generic
        # conversion error.
        if len(images) > 2:
            page_limit_message = "PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported."
            logger.error(f"Error converting PDF to image: {page_limit_message}")
            return {"error": page_limit_message}, None

    else:
        # Handle direct image files
        base64_encoded = base64.b64encode(file_data).decode('utf-8')
        base64_images.append(f"data:{content_type};base64,{base64_encoded}")

    # Prepare OpenAI request: one image_url entry per page/image
    openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": openai_content}
            ],
            response_format={"type": "json_schema", "json_schema": json_schema},
            temperature=0.5,
            max_tokens=16384
        )

        parsed_content = json.loads(response.choices[0].message.content.strip())
        return parsed_content, base64_images

    except Exception as e:
        logger.error(f"Error in OpenAI processing: {e}")
        return {"error": str(e)}, base64_images
131
-
132
def get_content_type_from_s3(file_key):
    """Fetch the MIME type of an object in the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        The ``ContentType`` from the object's HEAD response, or
        ``'application/octet-stream'`` when the response has none.

    Raises:
        Exception: If the HEAD request fails. The underlying error is
            attached as ``__cause__`` so its traceback is kept.
    """
    try:
        response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
    except Exception as e:
        raise Exception(f"Failed to get content type from S3: {str(e)}") from e
    return response.get('ContentType', 'application/octet-stream')
139
-
140
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency that rejects requests carrying a wrong API key.

    Args:
        api_key: Value of the required API-key request header.

    Raises:
        HTTPException: 401 when the header does not match ``API_KEY``, or
            when ``API_KEY`` is not configured at all.
    """
    import secrets  # local import: only needed for the constant-time compare

    expected = API_KEY
    # compare_digest takes the same time wherever the first mismatch occurs,
    # so response timing does not reveal how much of the key was guessed.
    # Both sides are encoded to bytes so non-ASCII header values cannot raise.
    if expected is None or not secrets.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API Key")
144
 
145
@app.get("/")
def read_root():
    """Return the static welcome payload for the root path."""
    welcome = {"message": "Welcome to the Invoice Summarization API!"}
    return welcome
148
 
149
@app.get("/ocr/extraction")
def extract_text_from_file(
    api_key: str = Depends(verify_api_key),
    file_key: str = Query(..., description="S3 file key for the file"),
    document_type: str = Query(..., description="Type of document"),
    entity_ref_key: str = Query(..., description="Entity Reference Key")
):
    """Extract structured data from a PDF or image stored in S3.

    If a document with the same ``entityrefkey`` is already stored, it is
    returned as-is. Otherwise the JSON schema for ``document_type`` is looked
    up, the file is fetched from S3 and run through ``extract_invoice_data``,
    and the result is stored in MongoDB.

    Returns:
        The cached document, or the newly stored document's id and extracted
        data. On any failure, ``{"error": {...}}`` with the error type,
        message and traceback.
    """
    try:
        existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
        if existing_document:
            # ObjectId is not JSON-serializable; expose it as a string.
            existing_document["_id"] = str(existing_document["_id"])
            return {
                "message": "Document Retrieved from MongoDB.",
                "document": existing_document
            }

        # Fetch JSON schema for the document type
        schema_doc = schema_collection.find_one({"document_type": document_type})
        if not schema_doc:
            raise ValueError("No schema found for the given document type")

        json_schema = schema_doc.get("json_schema")
        if not json_schema:
            raise ValueError("Schema is empty or not properly defined.")

        # Retrieve file from S3
        content_type = get_content_type_from_s3(file_key)
        file_data, _ = fetch_file_from_s3(file_key)

        # Extract structured data from the document
        extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)

        # A failed extraction must not be persisted: once stored it would be
        # served as the cached result for this entityrefkey on every later
        # call and the document could never be re-processed.
        if isinstance(extracted_data, dict) and "error" in extracted_data:
            raise RuntimeError(f"Document extraction failed: {extracted_data['error']}")

        # Store document in MongoDB
        document = {
            "file_key": file_key,
            "file_type": content_type,
            "document_type": document_type,
            "baseDataResp": base64_images,
            "entityrefkey": entity_ref_key,
            "extracted_data": extracted_data
        }

        inserted_doc = invoice_collection.insert_one(document)
        document_id = str(inserted_doc.inserted_id)
        logger.info(f"Document inserted with ID: {document_id}")

        return {
            "message": "Document successfully stored in MongoDB",
            "document_id": document_id,
            "entityrefkey": entity_ref_key,
            "baseDataResp": base64_images,
            "extracted_data": extracted_data
        }

    except Exception as e:
        error_details = {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc()}
        return {"error": error_details}
207
-
 
208
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
209
 
210
  if __name__ == '__main__':
211
- uvicorn.run(app=app)
 
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
+ from fastapi import FastAPI,Header, Query,Depends,HTTPException
6
+ from paddleocr import PaddleOCR, PPStructure, save_structure_res
7
  from PIL import Image
8
  import io
9
+ import numpy as np
10
  import fitz # PyMuPDF for PDF handling
11
  import logging
 
12
 
13
  import boto3
14
  import openai
 
17
  import re
18
  import json
19
  from dotenv import load_dotenv
20
+ import uvicorn
21
  import base64
 
22
 
 
23
  load_dotenv()
24
 
25
  # Set up logging
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
# Set up MongoDB
MONGODB_URI = os.getenv("MONGODB_URI")
DATABASE_NAME = os.getenv("DATABASE_NAME")
# The fallback has to be a string literal: a bare DocumentExtractionSchema is
# not defined anywhere in the visible file and would raise NameError as soon
# as the module is imported.
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "DocumentExtractionSchema")

app = FastAPI(docs_url='/')
use_gpu = False
output_dir = 'output'

if not MONGODB_URI:
    # Name the variable that is actually read above (MONGODB_URI).
    raise ValueError("MONGODB_URI is not set. Please add it to Hugging Face secrets.")

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')
 
 
43
 
44
  # AWS S3 Configuration
45
  API_KEY = os.getenv("API_KEY")
 
50
  # OpenAI Configuration
51
  openai.api_key = os.getenv("OPENAI_API_KEY")
52
 
53
+ # S3 Client
54
  s3_client = boto3.client(
55
  's3',
56
  aws_access_key_id=AWS_ACCESS_KEY,
57
  aws_secret_access_key=AWS_SECRET_KEY
58
  )
59
 
60
+ # Function to fetch file from S3
61
+
62
def fetch_file_from_s3_file(file_key):
    """Fetch an object and its MIME type from the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(file_data, content_type)`` tuple: the content read from the
        response body (not a BytesIO stream) and the ``ContentType``
        reported by S3.

    Raises:
        Exception: If the object cannot be fetched or read. The underlying
            error is attached as ``__cause__`` so its traceback is kept.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']  # Retrieve MIME type
        file_data = response['Body'].read()
    except Exception as e:
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
    return file_data, content_type
70
 
71
+ # Function to summarize text using OpenAI GPT
72
def _strip_json_fence(content):
    """Return ``content`` without a surrounding Markdown code fence, if any."""
    # Accept ```json and plain ``` fences, wherever they sit in the reply.
    fenced = re.search(r"```(?:json)?[ \t]*\n(.*?)\n?```", content, re.DOTALL)
    return fenced.group(1).strip() if fenced else content.strip()


def summarize_text(text):
    """Ask the OpenAI chat model to turn OCR text into structured invoice JSON.

    Args:
        text: Plain text extracted from the invoice.

    Returns:
        The parsed JSON object on success; ``None`` when the model reply is
        not valid JSON; or a string starting with ``"Error in summarization:"``
        when the API call itself fails.
    """
    system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:

Vendor Information:

Vendor Name
Vendor Address
Vendor GST No.
Invoice Details:

Invoice No.
Invoice Date → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
Invoice Currency/Currency
Base Amount/Amount
Tax Amount
Total Invoice Amount
Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
Customer Information:

Customer Name
Customer Address
Customer GST No.
Shipping and References:

MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
Shipping Order
You should extract this data and structure it into a table-like format in the following JSON format:
{
"invoice_headers": {
"VendorName": "",
"VendorAddress": "",
"VendorGSTNo": "",
"InvoiceNo": "",
"InvoiceDate": "",
"InvoiceCurrency": "",
"BaseAmount": "",
"TaxAmount": "",
"TotalInvoiceAmt": "",
"TypeofInvoice": "",
"CustomerName": "",
"CustomerAddress": "",
"CustomerGSTNO": "",
"RefNo": "",
"ShippingOrder": ""
},
"line_items": [
{
"Description": "",
"TaxPercentage": "",
"TaxAmount": "",
"Amount": 0
}
]
}
Guidelines for Processing:

Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
Convert the Invoice Date to the specified dd-MMM-yyyy format.
Use the correct currency and amounts for each invoice field.
For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
If certain values are missing or not applicable, leave them empty or set them as null where necessary.
This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. No extra content should not provide."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{text}"}
            ],
            temperature=0.5,
            max_tokens=16384
        )
        content = response.choices[0].message.content.strip()
        logger.debug("Raw model reply: %s", content)

        # The model often wraps its answer in a Markdown fence even though
        # the prompt asks for bare JSON.
        cleaned_content = _strip_json_fence(content)

        try:
            return json.loads(cleaned_content)
        except json.JSONDecodeError as e:
            logger.error("Error parsing JSON: %s", e)
            logger.debug("Cleaned content: %s", cleaned_content)
            return None
    except Exception as e:
        logger.exception("OpenAI summarization request failed")
        return f"Error in summarization: {str(e)}"
163
+ # Dependency to check API Key
 
 
 
 
 
 
 
 
 
164
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency that rejects requests carrying a wrong API key.

    Args:
        api_key: Value of the required API-key request header.

    Raises:
        HTTPException: 401 when the header does not match ``API_KEY``, or
            when ``API_KEY`` is not configured at all.
    """
    import secrets  # local import: only needed for the constant-time compare

    expected = API_KEY
    # compare_digest takes the same time wherever the first mismatch occurs,
    # so response timing does not reveal how much of the key was guessed.
    # Both sides are encoded to bytes so non-ASCII header values cannot raise.
    if expected is None or not secrets.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API Key")
167
 
168
@app.get("/")
def read_root():
    """Return the static welcome payload for the root path."""
    welcome = {"message": "Welcome to the PaddleOCR with S3 and GPT Summarization API!"}
    return welcome
171
 
172
def _ocr_image_text(image):
    """Run PaddleOCR on a PIL image and return the recognized text strings."""
    result = ocr.ocr(np.array(image.convert("RGB")), cls=True)
    fragments = []
    for line in result or []:
        # NOTE(review): PaddleOCR appears to return None for an image with no
        # detected text; skip such entries instead of iterating over None.
        if not line:
            continue
        fragments.extend(word_info[1][0] for word_info in line)
    return fragments


@app.get("/ocr/extraction")
def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(..., description="S3 file key for the file"),document_type: str = Query(...,description="JobDocument Doc_Type"),entityrefkey: str = Query(..., description="JobDocument JOD_PK")):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Returns a dict with the file key, content type, document type,
    entityrefkey, the file as a base64 data URI, the OCR text and the GPT
    summary. Unsupported content types and any exception are reported as
    ``{"error": ...}``.
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        extracted_text = []
        base64Data = base64.b64encode(file_data).decode('utf-8')

        # Dispatch on the bare MIME type: drop any parameters
        # (e.g. "; charset=...") and ignore case.
        mime_type = content_type.split(";")[0].strip().lower()

        if mime_type.startswith("image/"):  # Image file
            image = Image.open(io.BytesIO(file_data))
            extracted_text.extend(_ocr_image_text(image))
            # mime_type already carries the "image/" prefix, so it must not
            # be prepended a second time.
            base64DataResp = f"data:{mime_type};base64,{base64Data}"

        elif mime_type == "application/pdf":  # PDF file
            # Open PDF using PyMuPDF
            pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
            try:
                # Render each page as an image and OCR it
                for page in pdf_document:
                    pix = page.get_pixmap()
                    image = Image.open(io.BytesIO(pix.tobytes("png")))
                    extracted_text.extend(_ocr_image_text(image))
            finally:
                # Release the document even when rendering or OCR fails.
                pdf_document.close()
            base64DataResp = f"data:application/pdf;base64,{base64Data}"
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)

        # Summarize the extracted text
        summary = summarize_text(full_text)

        return {
            "file_key": file_key,
            "file_type": content_type,
            "document_type":document_type,
            "entityrefkey":entityrefkey,
            "base64DataResp":base64DataResp,
            "extracted_text": full_text,
            "summary": summary
        }

    except Exception as e:
        # Detailed error information
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
246
+
247
+ # Serve the output folder as static files
248
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
249
 
250
  if __name__ == '__main__':
251
+ uvicorn.run(app=app)