vkumartr committed on
Commit
09424c9
·
verified ·
1 Parent(s): 35fd719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -29
app.py CHANGED
@@ -95,40 +95,24 @@ def extract_pdf_text(file_data):
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
- Extracts data from a PDF or image and returns structured JSON based on the provided schema.
 
99
  """
100
- system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
101
 
102
  # Convert file to Base64
103
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
104
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
105
 
106
- # Handle PDF Extraction & Format to JSON Schema
107
  if content_type == "application/pdf":
108
  extracted_text = extract_pdf_text(file_data)
109
  if not extracted_text:
110
  return {"error": "Failed to extract text from PDF"}, base64dataresp
111
 
112
- try:
113
- # Send extracted text to OpenAI for structured JSON conversion
114
- response = openai.ChatCompletion.create(
115
- model="gpt-4o-mini",
116
- messages=[
117
- {"role": "system", "content": system_prompt},
118
- {"role": "user", "content": extracted_text}
119
- ],
120
- response_format={"type": "json_schema", "json_schema": json_schema},
121
- temperature=0.5,
122
- max_tokens=16384
123
- )
124
-
125
- parsed_content = json.loads(response.choices[0].message.content.strip())
126
- return parsed_content, base64dataresp # Return structured JSON
127
- except Exception as e:
128
- logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
129
- return {"error": str(e)}, base64dataresp
130
 
131
- # Handle Image Extraction using OpenAI Vision API
132
  elif content_type.startswith("image/"):
133
  try:
134
  response = openai.ChatCompletion.create(
@@ -147,13 +131,25 @@ def extract_invoice_data(file_data, content_type, json_schema):
147
  ]
148
  }
149
  ],
150
- response_format={"type": "json_schema", "json_schema": json_schema},
 
 
 
151
  temperature=0.5,
152
  max_tokens=16384
153
  )
154
 
155
- parsed_content = json.loads(response.choices[0].message.content.strip())
156
- return parsed_content, base64dataresp # Return structured JSON
 
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  logger.error(f"Error in OpenAI image processing: {e}")
159
  return {"error": str(e)}, base64dataresp
@@ -185,7 +181,7 @@ def extract_text_from_file(
185
  document_type: str = Query(..., description="Type of document"),
186
  entity_ref_key: str = Query(..., description="Entity Reference Key")
187
  ):
188
- """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
  if existing_document:
@@ -195,7 +191,7 @@ def extract_text_from_file(
195
  "document": existing_document
196
  }
197
 
198
- # Fetch JSON schema for the document type
199
  schema_doc = schema_collection.find_one({"document_type": document_type})
200
  if not schema_doc:
201
  raise ValueError("No schema found for the given document type")
@@ -208,7 +204,7 @@ def extract_text_from_file(
208
  content_type = get_content_type_from_s3(file_key)
209
  file_data, _ = fetch_file_from_s3(file_key)
210
 
211
- # Extract structured data from the document
212
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
 
214
  # Build and store document in MongoDB
@@ -233,7 +229,6 @@ def extract_text_from_file(
233
  "message": "Document successfully stored in MongoDB",
234
  "document_id": document_id,
235
  "entityrefkey": entity_ref_key,
236
- "base64dataResp": base64dataresp,
237
  "extracted_data": extracted_data
238
  }
239
 
 
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
+ Handles both PDF text extraction (PyMuPDF) and Image OCR using OpenAI GPT.
99
+ Returns extracted data along with the base64 representation.
100
  """
101
+ system_prompt = "You are an expert in document data extraction."
102
 
103
  # Convert file to Base64
104
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
105
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
106
 
107
+ # Handle PDF separately
108
  if content_type == "application/pdf":
109
  extracted_text = extract_pdf_text(file_data)
110
  if not extracted_text:
111
  return {"error": "Failed to extract text from PDF"}, base64dataresp
112
 
113
+ return {"extracted_text": extracted_text}, base64dataresp # Return plain text for PDFs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ # Handle Images using OpenAI
116
  elif content_type.startswith("image/"):
117
  try:
118
  response = openai.ChatCompletion.create(
 
131
  ]
132
  }
133
  ],
134
+ response_format={
135
+ "type": "json_schema",
136
+ "json_schema": json_schema
137
+ },
138
  temperature=0.5,
139
  max_tokens=16384
140
  )
141
 
142
+ # Clean and parse JSON output
143
+ content = response.choices[0].message.content.strip()
144
+ cleaned_content = content.strip().strip('```json').strip('```')
145
+
146
+ try:
147
+ parsed_content = json.loads(cleaned_content)
148
+ return parsed_content, base64dataresp # Return extracted structured data
149
+ except json.JSONDecodeError as e:
150
+ logger.error(f"JSON Parse Error: {e}")
151
+ return None, base64dataresp
152
+
153
  except Exception as e:
154
  logger.error(f"Error in OpenAI image processing: {e}")
155
  return {"error": str(e)}, base64dataresp
 
181
  document_type: str = Query(..., description="Type of document"),
182
  entity_ref_key: str = Query(..., description="Entity Reference Key")
183
  ):
184
+ """Extract text from a PDF or Image stored in S3 and process it accordingly."""
185
  try:
186
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
187
  if existing_document:
 
191
  "document": existing_document
192
  }
193
 
194
+ # Fetch schema for the document type
195
  schema_doc = schema_collection.find_one({"document_type": document_type})
196
  if not schema_doc:
197
  raise ValueError("No schema found for the given document type")
 
204
  content_type = get_content_type_from_s3(file_key)
205
  file_data, _ = fetch_file_from_s3(file_key)
206
 
207
+ # Extract data from the document (PDF or Image)
208
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
209
 
210
  # Build and store document in MongoDB
 
229
  "message": "Document successfully stored in MongoDB",
230
  "document_id": document_id,
231
  "entityrefkey": entity_ref_key,
 
232
  "extracted_data": extracted_data
233
  }
234