vkumartr committed on
Commit
ae5ac49
·
verified ·
1 Parent(s): 49834bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -70
app.py CHANGED
@@ -4,6 +4,7 @@ import hashlib
4
  from enum import Enum
5
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
  from PIL import Image
 
7
  import io
8
  import fitz # PyMuPDF for PDF handling
9
  import logging
@@ -95,71 +96,57 @@ def extract_pdf_text(file_data):
95
  # Function to extract structured invoice data using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
- Extracts data from a PDF or image and returns structured JSON based on the provided schema.
 
99
  """
100
  system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
 
101
 
102
- # Convert file to Base64
103
- base64_encoded = base64.b64encode(file_data).decode('utf-8')
104
- base64dataresp = f"data:{content_type};base64,{base64_encoded}"
105
-
106
- # Handle PDF Extraction & Format to JSON Schema
107
  if content_type == "application/pdf":
108
- extracted_text = extract_pdf_text(file_data)
109
- if not extracted_text:
110
- return {"error": "Failed to extract text from PDF"}, base64dataresp
111
-
112
  try:
113
- # Send extracted text to OpenAI for structured JSON conversion
114
- response = openai.ChatCompletion.create(
115
- model="gpt-4o-mini",
116
- messages=[
117
- {"role": "system", "content": system_prompt},
118
- {"role": "user", "content": extracted_text}
119
- ],
120
- response_format={"type": "json_schema", "json_schema": json_schema},
121
- temperature=0.5,
122
- max_tokens=16384
123
- )
124
-
125
- parsed_content = json.loads(response.choices[0].message.content.strip())
126
- return parsed_content, base64dataresp # Return structured JSON
127
- except Exception as e:
128
- logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
129
- return {"error": str(e)}, base64dataresp
130
 
131
- # Handle Image Extraction using OpenAI Vision API
132
- elif content_type.startswith("image/"):
133
- try:
134
- response = openai.ChatCompletion.create(
135
- model="gpt-4o-mini",
136
- messages=[
137
- {"role": "system", "content": system_prompt},
138
- {
139
- "role": "user",
140
- "content": [
141
- {
142
- "type": "image_url",
143
- "image_url": {
144
- "url": f"data:{content_type};base64,{base64_encoded}"
145
- }
146
- }
147
- ]
148
- }
149
- ],
150
- response_format={"type": "json_schema", "json_schema": json_schema},
151
- temperature=0.5,
152
- max_tokens=16384
153
- )
154
-
155
- parsed_content = json.loads(response.choices[0].message.content.strip())
156
- return parsed_content, base64dataresp # Return structured JSON
157
  except Exception as e:
158
- logger.error(f"Error in OpenAI image processing: {e}")
159
- return {"error": str(e)}, base64dataresp
160
 
161
  else:
162
- raise ValueError(f"Unsupported content type: {content_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  def get_content_type_from_s3(file_key):
165
  """Fetch the content type (MIME type) of a file stored in S3."""
@@ -185,7 +172,7 @@ def extract_text_from_file(
185
  document_type: str = Query(..., description="Type of document"),
186
  entity_ref_key: str = Query(..., description="Entity Reference Key")
187
  ):
188
- """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
  if existing_document:
@@ -209,38 +196,34 @@ def extract_text_from_file(
209
  file_data, _ = fetch_file_from_s3(file_key)
210
 
211
  # Extract structured data from the document
212
- extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
 
214
- # Build and store document in MongoDB
215
  document = {
216
  "file_key": file_key,
217
  "file_type": content_type,
218
  "document_type": document_type,
219
- "base64dataResp": base64dataresp,
220
  "entityrefkey": entity_ref_key,
221
  "extracted_data": extracted_data
222
  }
223
 
224
- try:
225
- inserted_doc = invoice_collection.insert_one(document)
226
- document_id = str(inserted_doc.inserted_id)
227
- logger.info(f"Document inserted with ID: {document_id}")
228
- except Exception as e:
229
- logger.error(f"Error inserting document: {str(e)}")
230
- raise HTTPException(status_code=500, detail="Error inserting document into MongoDB")
231
 
232
  return {
233
  "message": "Document successfully stored in MongoDB",
234
  "document_id": document_id,
235
  "entityrefkey": entity_ref_key,
236
- "base64dataResp": base64dataresp,
237
  "extracted_data": extracted_data
238
  }
239
 
240
  except Exception as e:
241
  error_details = {
242
- "error_type": type(e).__name__,
243
- "error_message": str(e),
244
  "traceback": traceback.format_exc()
245
  }
246
  return {"error": error_details}
 
4
  from enum import Enum
5
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
  from PIL import Image
7
+ from pdf2image import convert_from_bytes
8
  import io
9
  import fitz # PyMuPDF for PDF handling
10
  import logging
 
96
  # Function to extract structured invoice data using OpenAI GPT
97
  def extract_invoice_data(file_data, content_type, json_schema):
98
  """
99
+ Extracts data from a PDF (converted to images) or an image.
100
+ Only PDFs with 1 or 2 pages are allowed.
101
  """
102
  system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
103
+ base64_images = []
104
 
 
 
 
 
 
105
  if content_type == "application/pdf":
 
 
 
 
106
  try:
107
+ images = convert_from_bytes(file_data) # Convert PDF to images
108
+
109
+ if len(images) > 2:
110
+ raise ValueError("PDF contains more than 2 pages. Only PDFs with 1 or 2 pages are supported.")
111
+
112
+ for img in images[:2]: # Convert up to 2 pages
113
+ img_byte_arr = io.BytesIO()
114
+ img.save(img_byte_arr, format="PNG")
115
+ base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
116
+ base64_images.append(f"data:image/png;base64,{base64_encoded}")
117
+
118
+ content_type = "image/png"
 
 
 
 
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  except Exception as e:
121
+ logger.error(f"Error converting PDF to image: {e}")
122
+ return {"error": "Failed to process PDF"}, None
123
 
124
  else:
125
+ # Handle direct image files
126
+ base64_encoded = base64.b64encode(file_data).decode('utf-8')
127
+ base64_images.append(f"data:{content_type};base64,{base64_encoded}")
128
+
129
+ # Prepare OpenAI request
130
+ openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
131
+
132
+ try:
133
+ response = openai.ChatCompletion.create(
134
+ model="gpt-4o-mini",
135
+ messages=[
136
+ {"role": "system", "content": system_prompt},
137
+ {"role": "user", "content": openai_content}
138
+ ],
139
+ response_format={"type": "json_schema", "json_schema": json_schema},
140
+ temperature=0.5,
141
+ max_tokens=16384
142
+ )
143
+
144
+ parsed_content = json.loads(response.choices[0].message.content.strip())
145
+ return parsed_content, base64_images
146
+
147
+ except Exception as e:
148
+ logger.error(f"Error in OpenAI processing: {e}")
149
+ return {"error": str(e)}, base64_images
150
 
151
  def get_content_type_from_s3(file_key):
152
  """Fetch the content type (MIME type) of a file stored in S3."""
 
172
  document_type: str = Query(..., description="Type of document"),
173
  entity_ref_key: str = Query(..., description="Entity Reference Key")
174
  ):
175
+ """Extract structured data from a PDF or image stored in S3."""
176
  try:
177
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
178
  if existing_document:
 
196
  file_data, _ = fetch_file_from_s3(file_key)
197
 
198
  # Extract structured data from the document
199
+ extracted_data, base64_images = extract_invoice_data(file_data, content_type, json_schema)
200
 
201
+ # Store document in MongoDB
202
  document = {
203
  "file_key": file_key,
204
  "file_type": content_type,
205
  "document_type": document_type,
206
+ "base64_images": base64_images,
207
  "entityrefkey": entity_ref_key,
208
  "extracted_data": extracted_data
209
  }
210
 
211
+ inserted_doc = invoice_collection.insert_one(document)
212
+ document_id = str(inserted_doc.inserted_id)
213
+ logger.info(f"Document inserted with ID: {document_id}")
 
 
 
 
214
 
215
  return {
216
  "message": "Document successfully stored in MongoDB",
217
  "document_id": document_id,
218
  "entityrefkey": entity_ref_key,
219
+ "base64_images": base64_images,
220
  "extracted_data": extracted_data
221
  }
222
 
223
  except Exception as e:
224
  error_details = {
225
+ "error_type": type(e).__name__,
226
+ "error_message": str(e),
227
  "traceback": traceback.format_exc()
228
  }
229
  return {"error": error_details}