vkumartr committed on
Commit
49834bb
·
verified ·
1 Parent(s): 53ec771

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -165
app.py CHANGED
@@ -2,13 +2,12 @@ import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
- from fastapi import FastAPI,Header, Query,Depends,HTTPException
6
- from paddleocr import PaddleOCR, PPStructure, save_structure_res
7
  from PIL import Image
8
  import io
9
- import numpy as np
10
  import fitz # PyMuPDF for PDF handling
11
  import logging
 
12
 
13
  import boto3
14
  import openai
@@ -17,29 +16,43 @@ import traceback # For detailed traceback of errors
17
  import re
18
  import json
19
  from dotenv import load_dotenv
20
- import uvicorn
21
  import base64
 
22
 
 
23
  load_dotenv()
24
 
25
  # Set up logging
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
- #Set up MongoDB
30
  MONGODB_URI = os.getenv("MONGODB_URI")
31
  DATABASE_NAME = os.getenv("DATABASE_NAME")
32
- COLLECTION_NAME = os.getenv("COLLECTION_NAME",DocumentExtractionSchema)
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  app = FastAPI(docs_url='/')
35
  use_gpu = False
36
  output_dir = 'output'
37
 
38
- if not MONGODB_URI:
39
- raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
40
-
41
- # Initialize PaddleOCR
42
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
 
 
43
 
44
  # AWS S3 Configuration
45
  API_KEY = os.getenv("API_KEY")
@@ -58,8 +71,7 @@ s3_client = boto3.client(
58
  )
59
 
60
  # Function to fetch file from S3
61
-
62
- def fetch_file_from_s3_file(file_key):
63
  try:
64
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
65
  content_type = response['ContentType'] # Retrieve MIME type
@@ -68,98 +80,95 @@ def fetch_file_from_s3_file(file_key):
68
  except Exception as e:
69
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
70
 
71
- # Function to summarize text using OpenAI GPT
72
- def summarize_text(text):
73
- system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:
74
-
75
- Vendor Information:
76
-
77
- Vendor Name
78
- Vendor Address
79
- Vendor GST No.
80
- Invoice Details:
81
-
82
- Invoice No.
83
- Invoice Date → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
84
- Invoice Currency/Currency
85
- Base Amount/Amount
86
- Tax Amount
87
- Total Invoice Amount
88
- Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
89
- Customer Information:
90
-
91
- Customer Name
92
- Customer Address
93
- Customer GST No.
94
- Shipping and References:
95
-
96
- MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
97
- Shipping Order
98
- You should extract this data and structure it into a table-like format in the following JSON format:
99
- {
100
- "invoice_headers": {
101
- "VendorName": "",
102
- "VendorAddress": "",
103
- "VendorGSTNo": "",
104
- "InvoiceNo": "",
105
- "InvoiceDate": "",
106
- "InvoiceCurrency": "",
107
- "BaseAmount": "",
108
- "TaxAmount": "",
109
- "TotalInvoiceAmt": "",
110
- "TypeofInvoice": "",
111
- "CustomerName": "",
112
- "CustomerAddress": "",
113
- "CustomerGSTNO": "",
114
- "RefNo": "",
115
- "ShippingOrder": ""
116
- },
117
- "line_items": [
118
- {
119
- "Description": "",
120
- "TaxPercentage": "",
121
- "TaxAmount": "",
122
- "Amount": 0
123
- }
124
- ]
125
- }
126
- Guidelines for Processing:
127
-
128
- Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
129
- Convert the Invoice Date to the specified dd-MMM-yyyy format.
130
- Use the correct currency and amounts for each invoice field.
131
- For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
132
- If certain values are missing or not applicable, leave them empty or set them as null where necessary.
133
- This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. No extra content should not provide."""
134
  try:
135
- response = openai.ChatCompletion.create(
136
- model="gpt-4o-mini",
137
- messages=[
138
- {"role": "system", "content": system_prompt},
139
- {"role": "user", "content": f"{text}"}
140
- ],
141
- temperature=0.5,
142
- max_tokens=16384
143
- )
144
- content = response.choices[0].message.content.strip()
145
- print("Before content:", content)
146
- cleaned_content = re.sub(r'^.*```json\n', '', content) # Remove '```json\n' at the beginning
147
- cleaned_content = re.sub(r'\n```$', '', cleaned_content) # Remove '\n```' at the end
148
-
149
- # Step 2: Parse the cleaned content as JSON
150
- #parsed_content = json.loads(cleaned_content)
151
-
152
- # Step 3: Print the parsed JSON object
 
 
 
 
 
 
153
  try:
154
- parsed_content = json.loads(cleaned_content)
155
- return parsed_content
156
- except json.JSONDecodeError as e:
157
- print("Error parsing JSON:", e)
158
- # Optionally, print the cleaned content to debug
159
- print("Cleaned content:", cleaned_content)
160
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  except Exception as e:
162
- return f"Error in summarization: {str(e)}"
 
163
  # Dependency to check API Key
164
  def verify_api_key(api_key: str = Header(...)):
165
  if api_key != API_KEY:
@@ -167,76 +176,68 @@ def verify_api_key(api_key: str = Header(...)):
167
 
168
  @app.get("/")
169
  def read_root():
170
- return {"message": "Welcome to the PaddleOCR with S3 and GPT Summarization API!"}
171
 
172
  @app.get("/ocr/extraction")
173
- def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(..., description="S3 file key for the file"),document_type: str = Query(...,description="JobDocument Doc_Type"),entityrefkey: str = Query(..., description="JobDocument JOD_PK")):
174
- """
175
- Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.
176
- """
 
 
 
177
  try:
178
- # Fetch file from S3
179
- file_data, content_type = fetch_file_from_s3_file(file_key)
180
-
181
- extracted_text = []
182
- base64Data = base64.b64encode(file_data).decode('utf-8')
183
- # Determine file type based on MIME type
184
- if content_type.startswith("image/"): # Image file
185
- image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
186
- image_np = np.array(image) # Convert to NumPy array
187
- result = ocr.ocr(image_np, cls=True)
188
- base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
189
- # Extract text from OCR results
190
- for line in result:
191
- for word_info in line:
192
- extracted_text.append(word_info[1][0])
193
-
194
- elif content_type == "application/pdf": # PDF file
195
- # Open PDF using PyMuPDF
196
- pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
197
-
198
- extracted_text = []
199
-
200
- # Process each page in the PDF
201
- for page_number in range(len(pdf_document)):
202
- page = pdf_document[page_number]
203
-
204
- # Render the page as an image
205
- pix = page.get_pixmap()
206
- image = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
207
-
208
- # Convert Pillow image to NumPy array (for PaddleOCR compatibility)
209
- image_np = np.array(image)
210
-
211
- # Run OCR on the image
212
- result = ocr.ocr(image_np, cls=True)
213
- for line in result:
214
- for word_info in line:
215
- extracted_text.append(word_info[1][0])
216
-
217
- pdf_document.close()
218
- base64DataResp = f"data:application/pdf;base64,{base64Data}"
219
- else:
220
- return {"error": f"Unsupported file type: {content_type}"}
221
-
222
- # Combine extracted text
223
- full_text = " ".join(extracted_text)
224
-
225
- # Summarize the extracted text
226
- summary = summarize_text(full_text)
227
-
228
- return {
229
  "file_key": file_key,
230
  "file_type": content_type,
231
- "document_type":document_type,
232
- "entityrefkey":entityrefkey,
233
- "base64DataResp":base64DataResp,
234
- "extracted_text": full_text,
235
- "summary": summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  }
237
 
238
  except Exception as e:
239
- # Detailed error information
240
  error_details = {
241
  "error_type": type(e).__name__,
242
  "error_message": str(e),
@@ -248,4 +249,4 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
248
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
249
 
250
  if __name__ == '__main__':
251
- uvicorn.run(app=app)
 
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
+ from fastapi import FastAPI, Header, Query, Depends, HTTPException
 
6
  from PIL import Image
7
  import io
 
8
  import fitz # PyMuPDF for PDF handling
9
  import logging
10
+ from pymongo import MongoClient
11
 
12
  import boto3
13
  import openai
 
16
  import re
17
  import json
18
  from dotenv import load_dotenv
 
19
  import base64
20
+ from bson.objectid import ObjectId
21
 
22
+ db_client = None
23
  load_dotenv()
24
 
25
  # Set up logging
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
# MongoDB Configuration
MONGODB_URI = os.getenv("MONGODB_URI")
DATABASE_NAME = os.getenv("DATABASE_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
SCHEMA = os.getenv("SCHEMA")

# Check if environment variables are set.
# All four values are used immediately below (as the connection URI and as
# database/collection names), so fail fast with one clear message instead of
# letting an unset name reach the driver.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("MONGODB_URI", MONGODB_URI),
        ("DATABASE_NAME", DATABASE_NAME),
        ("COLLECTION_NAME", COLLECTION_NAME),
        ("SCHEMA", SCHEMA),
    )
    if not setting_value
]
if _missing_settings:
    raise ValueError(
        f"{', '.join(_missing_settings)} is not set. Please add it to your secrets."
    )

# Initialize MongoDB Connection
db_client = MongoClient(MONGODB_URI)
db = db_client[DATABASE_NAME]
invoice_collection = db[COLLECTION_NAME]  # extracted documents
schema_collection = db[SCHEMA]  # per-document-type JSON schemas
44
 
45
  app = FastAPI(docs_url='/')
46
  use_gpu = False
47
  output_dir = 'output'
48
 
49
@app.on_event("startup")
def startup_db():
    """
    Check the MongoDB connection when the application starts.

    Calls ``server_info()`` on the module-level client. A failure is logged
    together with its traceback but is not re-raised, so this check never
    aborts startup by itself.
    """
    try:
        db_client.server_info()
    except Exception as e:
        # Best-effort check: report the problem (with traceback) and carry on.
        logger.exception(f"MongoDB connection failed: {str(e)}")
    else:
        logger.info("MongoDB connection successful")
56
 
57
  # AWS S3 Configuration
58
  API_KEY = os.getenv("API_KEY")
 
71
  )
72
 
73
  # Function to fetch file from S3
74
+ def fetch_file_from_s3(file_key):
 
75
  try:
76
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
77
  content_type = response['ContentType'] # Retrieve MIME type
 
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
def extract_pdf_text(file_data):
    """
    Extracts text from a PDF file using PyMuPDF (fitz).

    Args:
        file_data: PDF content handed to ``fitz.open`` as ``stream``.

    Returns:
        The text of every page joined with newlines, or ``None`` when opening
        or reading the document raises (the error is logged).
    """
    try:
        pdf_document = fitz.open(stream=file_data, filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in pdf_document)
        finally:
            # Always release the document, even if a page fails to parse.
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF Extraction Error: {e}")
        return None
94
+
95
+ # Function to summarize text using OpenAI GPT
96
def extract_invoice_data(file_data, content_type, json_schema):
    """
    Extracts data from a PDF or image and returns structured JSON based on the provided schema.

    PDFs are converted to text with ``extract_pdf_text`` and the text is sent
    to the model; images are sent to the model as a base64 data URL.

    Args:
        file_data: Raw file bytes.
        content_type: MIME type of the file; ``application/pdf`` and
            ``image/*`` are supported.
        json_schema: Value passed as ``json_schema`` inside ``response_format``.
            NOTE(review): its exact shape is dictated by the OpenAI API, not
            checked here.

    Returns:
        A ``(data, base64dataresp)`` tuple. ``data`` is the parsed JSON from the
        model, or ``{"error": ...}`` when text extraction, the API call, or
        JSON parsing fails. ``base64dataresp`` is the file as a data URL.

    Raises:
        ValueError: If ``content_type`` is neither a PDF nor an image.
    """
    is_pdf = content_type == "application/pdf"
    is_image = not is_pdf and content_type.startswith("image/")
    # Reject unsupported types up front, before encoding the whole file.
    if not (is_pdf or is_image):
        raise ValueError(f"Unsupported content type: {content_type}")

    system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."

    # Convert file to Base64
    base64_encoded = base64.b64encode(file_data).decode('utf-8')
    base64dataresp = f"data:{content_type};base64,{base64_encoded}"

    if is_pdf:
        extracted_text = extract_pdf_text(file_data)
        if not extracted_text:
            return {"error": "Failed to extract text from PDF"}, base64dataresp
        user_content = extracted_text
        failure_label = "Error in OpenAI text-to-JSON conversion"
    else:
        # The image is sent as the same data URL that is returned to the caller.
        user_content = [
            {
                "type": "image_url",
                "image_url": {"url": base64dataresp},
            }
        ]
        failure_label = "Error in OpenAI image processing"

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format={"type": "json_schema", "json_schema": json_schema},
            temperature=0.5,
            max_tokens=16384,
        )
        parsed_content = json.loads(response.choices[0].message.content.strip())
        return parsed_content, base64dataresp  # Return structured JSON
    except Exception as e:
        # API and parsing failures are reported in-band so the caller still
        # receives the encoded file.
        logger.error(f"{failure_label}: {e}")
        return {"error": str(e)}, base64dataresp
163
+
164
def get_content_type_from_s3(file_key):
    """Fetch the content type (MIME type) of a file stored in S3.

    Args:
        file_key: Key of the object in ``S3_BUCKET_NAME``.

    Returns:
        The ``ContentType`` from the ``head_object`` response, or
        ``application/octet-stream`` when the response has none.

    Raises:
        Exception: If the lookup fails; the original error is attached as the
            cause so its traceback is not lost.
    """
    try:
        response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        return response.get('ContentType', 'application/octet-stream')  # Default to binary if not found
    except Exception as e:
        raise Exception(f"Failed to get content type from S3: {str(e)}") from e
171
+
172
  # Dependency to check API Key
173
  def verify_api_key(api_key: str = Header(...)):
174
  if api_key != API_KEY:
 
176
 
177
@app.get("/")
def read_root():
    """Return the API's welcome message."""
    welcome = {"message": "Welcome to the Invoice Summarization API!"}
    return welcome
180
 
181
  @app.get("/ocr/extraction")
182
+ def extract_text_from_file(
183
+ api_key: str = Depends(verify_api_key),
184
+ file_key: str = Query(..., description="S3 file key for the file"),
185
+ document_type: str = Query(..., description="Type of document"),
186
+ entity_ref_key: str = Query(..., description="Entity Reference Key")
187
+ ):
188
+ """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
+ existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
+ if existing_document:
192
+ existing_document["_id"] = str(existing_document["_id"])
193
+ return {
194
+ "message": "Document Retrieved from MongoDB.",
195
+ "document": existing_document
196
+ }
197
+
198
+ # Fetch JSON schema for the document type
199
+ schema_doc = schema_collection.find_one({"document_type": document_type})
200
+ if not schema_doc:
201
+ raise ValueError("No schema found for the given document type")
202
+
203
+ json_schema = schema_doc.get("json_schema")
204
+ if not json_schema:
205
+ raise ValueError("Schema is empty or not properly defined.")
206
+
207
+ # Retrieve file from S3
208
+ content_type = get_content_type_from_s3(file_key)
209
+ file_data, _ = fetch_file_from_s3(file_key)
210
+
211
+ # Extract structured data from the document
212
+ extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
+
214
+ # Build and store document in MongoDB
215
+ document = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  "file_key": file_key,
217
  "file_type": content_type,
218
+ "document_type": document_type,
219
+ "base64dataResp": base64dataresp,
220
+ "entityrefkey": entity_ref_key,
221
+ "extracted_data": extracted_data
222
+ }
223
+
224
+ try:
225
+ inserted_doc = invoice_collection.insert_one(document)
226
+ document_id = str(inserted_doc.inserted_id)
227
+ logger.info(f"Document inserted with ID: {document_id}")
228
+ except Exception as e:
229
+ logger.error(f"Error inserting document: {str(e)}")
230
+ raise HTTPException(status_code=500, detail="Error inserting document into MongoDB")
231
+
232
+ return {
233
+ "message": "Document successfully stored in MongoDB",
234
+ "document_id": document_id,
235
+ "entityrefkey": entity_ref_key,
236
+ "base64dataResp": base64dataresp,
237
+ "extracted_data": extracted_data
238
  }
239
 
240
  except Exception as e:
 
241
  error_details = {
242
  "error_type": type(e).__name__,
243
  "error_message": str(e),
 
249
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
250
 
251
  if __name__ == '__main__':
252
+ uvicorn.run(app=app)