vkumartr committed on
Commit
4fe40a7
·
verified ·
1 Parent(s): 4f9f527

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -110
app.py CHANGED
@@ -1,23 +1,18 @@
1
  import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
- import hashlib
4
- from enum import Enum
5
- from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
- from PIL import Image
7
  import io
8
- import fitz # PyMuPDF for PDF handling
9
  import logging
10
- from pymongo import MongoClient
11
-
12
  import boto3
13
  import openai
14
  import os
15
- import traceback # For detailed traceback of errors
16
- import re
17
  import json
18
- from dotenv import load_dotenv
19
  import base64
20
- from bson.objectid import ObjectId
 
 
 
21
 
22
  db_client = None
23
  load_dotenv()
@@ -32,19 +27,15 @@ DATABASE_NAME = os.getenv("DATABASE_NAME")
32
  COLLECTION_NAME = os.getenv("COLLECTION_NAME")
33
  SCHEMA = os.getenv("SCHEMA")
34
 
35
- # Check if environment variables are set
36
  if not MONGODB_URI:
37
  raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
38
 
39
- # Initialize MongoDB Connection
40
  db_client = MongoClient(MONGODB_URI)
41
  db = db_client[DATABASE_NAME]
42
  invoice_collection = db[COLLECTION_NAME]
43
  schema_collection = db[SCHEMA]
44
 
45
  app = FastAPI(docs_url='/')
46
- use_gpu = False
47
- output_dir = 'output'
48
 
49
  @app.on_event("startup")
50
  def startup_db():
@@ -63,114 +54,86 @@ S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
63
  # OpenAI Configuration
64
  openai.api_key = os.getenv("OPENAI_API_KEY")
65
 
66
- # S3 Client
67
  s3_client = boto3.client(
68
  's3',
69
  aws_access_key_id=AWS_ACCESS_KEY,
70
  aws_secret_access_key=AWS_SECRET_KEY
71
  )
72
 
73
- # Function to fetch file from S3
74
  def fetch_file_from_s3(file_key):
 
75
  try:
76
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
77
- content_type = response['ContentType'] # Retrieve MIME type
78
  file_data = response['Body'].read()
79
- return file_data, content_type # Return file data as BytesIO
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
- def extract_pdf_text(file_data):
84
- """
85
- Extracts text from a PDF file using PyMuPDF (fitz).
86
- """
87
- try:
88
- pdf_document = fitz.open(stream=file_data, filetype="pdf")
89
- text = "\n".join([page.get_text("text") for page in pdf_document])
90
- return text
91
- except Exception as e:
92
- logger.error(f"PDF Extraction Error: {e}")
93
- return None
94
-
95
- # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
- Extracts data from a PDF or image and returns structured JSON based on the provided schema.
 
99
  """
100
  system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
 
101
 
102
- # Convert file to Base64
103
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
104
- base64dataresp = f"data:{content_type};base64,{base64_encoded}"
105
-
106
- # Handle PDF Extraction & Format to JSON Schema
107
  if content_type == "application/pdf":
108
- extracted_text = extract_pdf_text(file_data)
109
- if not extracted_text:
110
- return {"error": "Failed to extract text from PDF"}, base64dataresp
111
-
112
  try:
113
- # Send extracted text to OpenAI for structured JSON conversion
114
- response = openai.ChatCompletion.create(
115
- model="gpt-4o-mini",
116
- messages=[
117
- {"role": "system", "content": system_prompt},
118
- {"role": "user", "content": extracted_text}
119
- ],
120
- response_format={"type": "json_schema", "json_schema": json_schema},
121
- temperature=0.5,
122
- max_tokens=16384
123
- )
124
-
125
- parsed_content = json.loads(response.choices[0].message.content.strip())
126
- return parsed_content, base64dataresp # Return structured JSON
127
- except Exception as e:
128
- logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
129
- return {"error": str(e)}, base64dataresp
130
 
131
- # Handle Image Extraction using OpenAI Vision API
132
- elif content_type.startswith("image/"):
133
- try:
134
- response = openai.ChatCompletion.create(
135
- model="gpt-4o-mini",
136
- messages=[
137
- {"role": "system", "content": system_prompt},
138
- {
139
- "role": "user",
140
- "content": [
141
- {
142
- "type": "image_url",
143
- "image_url": {
144
- "url": f"data:{content_type};base64,{base64_encoded}"
145
- }
146
- }
147
- ]
148
- }
149
- ],
150
- response_format={"type": "json_schema", "json_schema": json_schema},
151
- temperature=0.5,
152
- max_tokens=16384
153
- )
154
-
155
- parsed_content = json.loads(response.choices[0].message.content.strip())
156
- return parsed_content, base64dataresp # Return structured JSON
157
  except Exception as e:
158
- logger.error(f"Error in OpenAI image processing: {e}")
159
- return {"error": str(e)}, base64dataresp
160
 
161
  else:
162
- raise ValueError(f"Unsupported content type: {content_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  def get_content_type_from_s3(file_key):
165
- """Fetch the content type (MIME type) of a file stored in S3."""
166
  try:
167
  response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
168
- return response.get('ContentType', 'application/octet-stream') # Default to binary if not found
169
  except Exception as e:
170
  raise Exception(f"Failed to get content type from S3: {str(e)}")
171
 
172
- # Dependency to check API Key
173
  def verify_api_key(api_key: str = Header(...)):
 
174
  if api_key != API_KEY:
175
  raise HTTPException(status_code=401, detail="Invalid API Key")
176
 
@@ -185,7 +148,7 @@ def extract_text_from_file(
185
  document_type: str = Query(..., description="Type of document"),
186
  entity_ref_key: str = Query(..., description="Entity Reference Key")
187
  ):
188
- """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
  if existing_document:
@@ -209,43 +172,34 @@ def extract_text_from_file(
209
  file_data, _ = fetch_file_from_s3(file_key)
210
 
211
  # Extract structured data from the document
212
- extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
 
214
- # Build and store document in MongoDB
215
  document = {
216
  "file_key": file_key,
217
  "file_type": content_type,
218
  "document_type": document_type,
219
- "base64dataResp": base64dataresp,
220
  "entityrefkey": entity_ref_key,
221
  "extracted_data": extracted_data
222
  }
223
 
224
- try:
225
- inserted_doc = invoice_collection.insert_one(document)
226
- document_id = str(inserted_doc.inserted_id)
227
- logger.info(f"Document inserted with ID: {document_id}")
228
- except Exception as e:
229
- logger.error(f"Error inserting document: {str(e)}")
230
- raise HTTPException(status_code=500, detail="Error inserting document into MongoDB")
231
 
232
  return {
233
  "message": "Document successfully stored in MongoDB",
234
  "document_id": document_id,
235
  "entityrefkey": entity_ref_key,
236
- "base64dataResp": base64dataresp,
237
  "extracted_data": extracted_data
238
  }
239
 
240
  except Exception as e:
241
- error_details = {
242
- "error_type": type(e).__name__,
243
- "error_message": str(e),
244
- "traceback": traceback.format_exc()
245
- }
246
  return {"error": error_details}
247
-
248
- # Serve the output folder as static files
249
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
250
 
251
  if __name__ == '__main__':
 
1
  import uvicorn
2
  from fastapi.staticfiles import StaticFiles
 
 
 
 
3
  import io
 
4
  import logging
5
+ import fitz # PyMuPDF for PDF handling
 
6
  import boto3
7
  import openai
8
  import os
9
+ import traceback
 
10
  import json
 
11
  import base64
12
+ from pdf2image import convert_from_bytes
13
+ from fastapi import FastAPI, Header, Query, Depends, HTTPException
14
+ from pymongo import MongoClient
15
+ from dotenv import load_dotenv
16
 
17
  db_client = None
18
  load_dotenv()
 
27
  COLLECTION_NAME = os.getenv("COLLECTION_NAME")
28
  SCHEMA = os.getenv("SCHEMA")
29
 
 
30
  if not MONGODB_URI:
31
  raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
32
 
 
33
  db_client = MongoClient(MONGODB_URI)
34
  db = db_client[DATABASE_NAME]
35
  invoice_collection = db[COLLECTION_NAME]
36
  schema_collection = db[SCHEMA]
37
 
38
  app = FastAPI(docs_url='/')
 
 
39
 
40
  @app.on_event("startup")
41
  def startup_db():
 
54
  # OpenAI Configuration
55
  openai.api_key = os.getenv("OPENAI_API_KEY")
56
 
 
57
  s3_client = boto3.client(
58
  's3',
59
  aws_access_key_id=AWS_ACCESS_KEY,
60
  aws_secret_access_key=AWS_SECRET_KEY
61
  )
62
 
 
63
  def fetch_file_from_s3(file_key):
64
+ """Retrieve file from S3"""
65
  try:
66
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
67
+ content_type = response['ContentType']
68
  file_data = response['Body'].read()
69
+ return file_data, content_type
70
  except Exception as e:
71
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def extract_invoice_data(file_data, content_type, json_schema):
74
  """
75
+ Extracts data from a PDF (converted to images) or an image.
76
+ Only PDFs with 1 or 2 pages are allowed.
77
  """
78
  system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
79
+ base64_images = []
80
 
 
 
 
 
 
81
  if content_type == "application/pdf":
 
 
 
 
82
  try:
83
+ images = convert_from_bytes(file_data) # Convert PDF to images
84
+
85
+ if len(images) > 2:
86
+ raise ValueError("PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported.")
87
+
88
+ for img in images[:2]: # Convert up to 2 pages
89
+ img_byte_arr = io.BytesIO()
90
+ img.save(img_byte_arr, format="PNG")
91
+ base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
92
+ base64_images.append(f"data:image/png;base64,{base64_encoded}")
93
+
94
+ content_type = "image/png"
 
 
 
 
 
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  except Exception as e:
97
+ logger.error(f"Error converting PDF to image: {e}")
98
+ return {"error": "Failed to process PDF"}, None
99
 
100
  else:
101
+ # Handle direct image files
102
+ base64_encoded = base64.b64encode(file_data).decode('utf-8')
103
+ base64_images.append(f"data:{content_type};base64,{base64_encoded}")
104
+
105
+ # Prepare OpenAI request
106
+ openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
107
+
108
+ try:
109
+ response = openai.ChatCompletion.create(
110
+ model="gpt-4o-mini",
111
+ messages=[
112
+ {"role": "system", "content": system_prompt},
113
+ {"role": "user", "content": openai_content}
114
+ ],
115
+ response_format={"type": "json_schema", "json_schema": json_schema},
116
+ temperature=0.5,
117
+ max_tokens=16384
118
+ )
119
+
120
+ parsed_content = json.loads(response.choices[0].message.content.strip())
121
+ return parsed_content, base64_images
122
+
123
+ except Exception as e:
124
+ logger.error(f"Error in OpenAI processing: {e}")
125
+ return {"error": str(e)}, base64_images
126
 
127
  def get_content_type_from_s3(file_key):
128
+ """Fetch MIME type of a file from S3"""
129
  try:
130
  response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
131
+ return response.get('ContentType', 'application/octet-stream')
132
  except Exception as e:
133
  raise Exception(f"Failed to get content type from S3: {str(e)}")
134
 
 
135
  def verify_api_key(api_key: str = Header(...)):
136
+ """Verify API Key"""
137
  if api_key != API_KEY:
138
  raise HTTPException(status_code=401, detail="Invalid API Key")
139
 
 
148
  document_type: str = Query(..., description="Type of document"),
149
  entity_ref_key: str = Query(..., description="Entity Reference Key")
150
  ):
151
+ """Extract structured data from a PDF or image stored in S3."""
152
  try:
153
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
154
  if existing_document:
 
172
  file_data, _ = fetch_file_from_s3(file_key)
173
 
174
  # Extract structured data from the document
175
+ extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)
176
 
177
+ # Store document in MongoDB
178
  document = {
179
  "file_key": file_key,
180
  "file_type": content_type,
181
  "document_type": document_type,
182
+ "baseDataResp": base64_images,
183
  "entityrefkey": entity_ref_key,
184
  "extracted_data": extracted_data
185
  }
186
 
187
+ inserted_doc = invoice_collection.insert_one(document)
188
+ document_id = str(inserted_doc.inserted_id)
189
+ logger.info(f"Document inserted with ID: {document_id}")
 
 
 
 
190
 
191
  return {
192
  "message": "Document successfully stored in MongoDB",
193
  "document_id": document_id,
194
  "entityrefkey": entity_ref_key,
195
+ "baseDataResp": base64_images,
196
  "extracted_data": extracted_data
197
  }
198
 
199
  except Exception as e:
200
+ error_details = {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc()}
 
 
 
 
201
  return {"error": error_details}
202
+
 
203
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
204
 
205
  if __name__ == '__main__':