vkumartr committed on
Commit
4f9f527
·
verified ·
1 Parent(s): 3be35b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -24
app.py CHANGED
@@ -95,24 +95,40 @@ def extract_pdf_text(file_data):
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
- Handles both PDF text extraction (PyMuPDF) and Image OCR using OpenAI GPT.
99
- Returns extracted data along with the base64 representation.
100
  """
101
- system_prompt = "You are an expert in document data extraction."
102
 
103
  # Convert file to Base64
104
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
105
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
106
 
107
- # Handle PDF separately
108
  if content_type == "application/pdf":
109
  extracted_text = extract_pdf_text(file_data)
110
  if not extracted_text:
111
  return {"error": "Failed to extract text from PDF"}, base64dataresp
112
 
113
- return {"extracted_text": extracted_text}, base64dataresp # Return plain text for PDFs
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- # Handle Images using OpenAI
 
 
 
 
 
 
116
  elif content_type.startswith("image/"):
117
  try:
118
  response = openai.ChatCompletion.create(
@@ -131,25 +147,13 @@ def extract_invoice_data(file_data, content_type, json_schema):
131
  ]
132
  }
133
  ],
134
- response_format={
135
- "type": "json_schema",
136
- "json_schema": json_schema
137
- },
138
  temperature=0.5,
139
  max_tokens=16384
140
  )
141
 
142
- # Clean and parse JSON output
143
- content = response.choices[0].message.content.strip()
144
- cleaned_content = content.strip().strip('```json').strip('```')
145
-
146
- try:
147
- parsed_content = json.loads(cleaned_content)
148
- return parsed_content, base64dataresp # Return extracted structured data
149
- except json.JSONDecodeError as e:
150
- logger.error(f"JSON Parse Error: {e}")
151
- return None, base64dataresp
152
-
153
  except Exception as e:
154
  logger.error(f"Error in OpenAI image processing: {e}")
155
  return {"error": str(e)}, base64dataresp
@@ -181,7 +185,7 @@ def extract_text_from_file(
181
  document_type: str = Query(..., description="Type of document"),
182
  entity_ref_key: str = Query(..., description="Entity Reference Key")
183
  ):
184
- """Extract text from a PDF or Image stored in S3 and process it accordingly."""
185
  try:
186
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
187
  if existing_document:
@@ -191,7 +195,7 @@ def extract_text_from_file(
191
  "document": existing_document
192
  }
193
 
194
- # Fetch schema for the document type
195
  schema_doc = schema_collection.find_one({"document_type": document_type})
196
  if not schema_doc:
197
  raise ValueError("No schema found for the given document type")
@@ -204,7 +208,7 @@ def extract_text_from_file(
204
  content_type = get_content_type_from_s3(file_key)
205
  file_data, _ = fetch_file_from_s3(file_key)
206
 
207
- # Extract data from the document (PDF or Image)
208
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
209
 
210
  # Build and store document in MongoDB
 
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  """
98
+ Extracts data from a PDF or image and returns structured JSON based on the provided schema.
 
99
  """
100
+ system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
101
 
102
  # Convert file to Base64
103
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
104
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
105
 
106
+ # Handle PDF Extraction & Format to JSON Schema
107
  if content_type == "application/pdf":
108
  extracted_text = extract_pdf_text(file_data)
109
  if not extracted_text:
110
  return {"error": "Failed to extract text from PDF"}, base64dataresp
111
 
112
+ try:
113
+ # Send extracted text to OpenAI for structured JSON conversion
114
+ response = openai.ChatCompletion.create(
115
+ model="gpt-4o-mini",
116
+ messages=[
117
+ {"role": "system", "content": system_prompt},
118
+ {"role": "user", "content": extracted_text}
119
+ ],
120
+ response_format={"type": "json_schema", "json_schema": json_schema},
121
+ temperature=0.5,
122
+ max_tokens=16384
123
+ )
124
 
125
+ parsed_content = json.loads(response.choices[0].message.content.strip())
126
+ return parsed_content, base64dataresp # Return structured JSON
127
+ except Exception as e:
128
+ logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
129
+ return {"error": str(e)}, base64dataresp
130
+
131
+ # Handle Image Extraction using OpenAI Vision API
132
  elif content_type.startswith("image/"):
133
  try:
134
  response = openai.ChatCompletion.create(
 
147
  ]
148
  }
149
  ],
150
+ response_format={"type": "json_schema", "json_schema": json_schema},
 
 
 
151
  temperature=0.5,
152
  max_tokens=16384
153
  )
154
 
155
+ parsed_content = json.loads(response.choices[0].message.content.strip())
156
+ return parsed_content, base64dataresp # Return structured JSON
 
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  logger.error(f"Error in OpenAI image processing: {e}")
159
  return {"error": str(e)}, base64dataresp
 
185
  document_type: str = Query(..., description="Type of document"),
186
  entity_ref_key: str = Query(..., description="Entity Reference Key")
187
  ):
188
+ """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
  if existing_document:
 
195
  "document": existing_document
196
  }
197
 
198
+ # Fetch JSON schema for the document type
199
  schema_doc = schema_collection.find_one({"document_type": document_type})
200
  if not schema_doc:
201
  raise ValueError("No schema found for the given document type")
 
208
  content_type = get_content_type_from_s3(file_key)
209
  file_data, _ = fetch_file_from_s3(file_key)
210
 
211
+ # Extract structured data from the document
212
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
 
214
  # Build and store document in MongoDB